Posted to commits@lucene.apache.org by dw...@apache.org on 2020/12/18 12:11:03 UTC
[lucene-solr] 01/02: LUCENE-9570: code reformatting [partial].
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 6faa4f98e04817bf27d02d96332904a29fb7db69
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Fri Dec 18 13:02:08 2020 +0100
LUCENE-9570: code reformatting [partial].
---
gradle/validation/spotless.gradle | 21 +-
.../apache/lucene/analysis/ar/ArabicAnalyzer.java | 90 +-
.../analysis/ar/ArabicNormalizationFilter.java | 10 +-
.../ar/ArabicNormalizationFilterFactory.java | 7 +-
.../lucene/analysis/ar/ArabicNormalizer.java | 77 +-
.../lucene/analysis/ar/ArabicStemFilter.java | 24 +-
.../analysis/ar/ArabicStemFilterFactory.java | 7 +-
.../apache/lucene/analysis/ar/ArabicStemmer.java | 91 +-
.../apache/lucene/analysis/ar/package-info.java | 6 +-
.../collation/CollationAttributeFactory.java | 90 +-
.../lucene/collation/CollationDocValuesField.java | 27 +-
.../lucene/collation/CollationKeyAnalyzer.java | 84 +-
.../org/apache/lucene/collation/package-info.java | 122 +-
.../tokenattributes/CollatedTermAttributeImpl.java | 10 +-
.../lucene/analysis/ar/TestArabicAnalyzer.java | 90 +-
.../lucene/analysis/ar/TestArabicFilters.java | 61 +-
.../analysis/ar/TestArabicNormalizationFilter.java | 56 +-
.../lucene/analysis/ar/TestArabicStemFilter.java | 98 +-
.../collation/TestCollationDocValuesField.java | 47 +-
.../lucene/collation/TestCollationKeyAnalyzer.java | 43 +-
.../lucene/analysis/standard/StandardAnalyzer.java | 54 +-
.../analysis/standard/StandardTokenizer.java | 94 +-
.../standard/StandardTokenizerFactory.java | 13 +-
.../lucene/analysis/standard/package-info.java | 23 +-
.../tokenattributes/BytesTermAttribute.java | 9 +-
.../tokenattributes/BytesTermAttributeImpl.java | 13 +-
.../tokenattributes/CharTermAttribute.java | 101 +-
.../tokenattributes/CharTermAttributeImpl.java | 98 +-
.../analysis/tokenattributes/FlagsAttribute.java | 20 +-
.../tokenattributes/FlagsAttributeImpl.java | 13 +-
.../analysis/tokenattributes/KeywordAttribute.java | 23 +-
.../tokenattributes/KeywordAttributeImpl.java | 12 +-
.../analysis/tokenattributes/OffsetAttribute.java | 41 +-
.../tokenattributes/OffsetAttributeImpl.java | 26 +-
.../tokenattributes/PackedTokenAttributeImpl.java | 93 +-
.../analysis/tokenattributes/PayloadAttribute.java | 23 +-
.../tokenattributes/PayloadAttributeImpl.java | 29 +-
.../PositionIncrementAttribute.java | 46 +-
.../PositionIncrementAttributeImpl.java | 21 +-
.../tokenattributes/PositionLengthAttribute.java | 32 +-
.../PositionLengthAttributeImpl.java | 23 +-
.../tokenattributes/TermFrequencyAttribute.java | 8 +-
.../TermFrequencyAttributeImpl.java | 21 +-
.../tokenattributes/TermToBytesRefAttribute.java | 26 +-
.../analysis/tokenattributes/TypeAttribute.java | 15 +-
.../tokenattributes/TypeAttributeImpl.java | 17 +-
.../analysis/tokenattributes/package-info.java | 4 +-
.../analysis/standard/TestStandardAnalyzer.java | 638 ++++--
.../analysis/standard/TestStandardFactories.java | 38 +-
.../tokenattributes/TestBytesRefAttImpl.java | 1 -
.../tokenattributes/TestCharTermAttributeImpl.java | 182 +-
.../TestPackedTokenAttributeImpl.java | 77 +-
.../tokenattributes/TestSimpleAttributeImpl.java | 43 +-
.../lucene/search/highlight/DefaultEncoder.java | 10 +-
.../apache/lucene/search/highlight/Encoder.java | 11 +-
.../apache/lucene/search/highlight/Formatter.java | 12 +-
.../apache/lucene/search/highlight/Fragmenter.java | 21 +-
.../lucene/search/highlight/GradientFormatter.java | 360 ++--
.../lucene/search/highlight/Highlighter.java | 417 ++--
.../highlight/InvalidTokenOffsetsException.java | 12 +-
.../search/highlight/LimitTokenOffsetFilter.java | 5 +-
.../lucene/search/highlight/NullFragmenter.java | 9 +-
.../search/highlight/OffsetLimitTokenFilter.java | 15 +-
.../lucene/search/highlight/PositionSpan.java | 3 +-
.../lucene/search/highlight/QueryScorer.java | 92 +-
.../search/highlight/QueryTermExtractor.java | 80 +-
.../lucene/search/highlight/QueryTermScorer.java | 52 +-
.../org/apache/lucene/search/highlight/Scorer.java | 38 +-
.../lucene/search/highlight/SimpleFragmenter.java | 20 +-
.../lucene/search/highlight/SimpleHTMLEncoder.java | 66 +-
.../search/highlight/SimpleHTMLFormatter.java | 13 +-
.../search/highlight/SimpleSpanFragmenter.java | 21 +-
.../search/highlight/SpanGradientFormatter.java | 33 +-
.../search/highlight/TermVectorLeafReader.java | 79 +-
.../lucene/search/highlight/TextFragment.java | 48 +-
.../apache/lucene/search/highlight/TokenGroup.java | 21 +-
.../lucene/search/highlight/TokenSources.java | 136 +-
.../highlight/TokenStreamFromTermVector.java | 124 +-
.../lucene/search/highlight/WeightedSpanTerm.java | 15 +-
.../highlight/WeightedSpanTermExtractor.java | 266 +--
.../lucene/search/highlight/WeightedTerm.java | 46 +-
.../lucene/search/highlight/package-info.java | 70 +-
.../BreakIteratorShrinkingAdjuster.java | 4 +-
.../matchhighlight/CharSequenceIterator.java | 4 +-
.../matchhighlight/FieldValueHighlighters.java | 76 +-
.../search/matchhighlight/MatchHighlighter.java | 147 +-
.../matchhighlight/MatchRegionRetriever.java | 109 +-
.../lucene/search/matchhighlight/OffsetRange.java | 9 +-
.../matchhighlight/OffsetsFromMatchIterator.java | 10 +-
.../matchhighlight/OffsetsFromPositions.java | 13 +-
.../search/matchhighlight/OffsetsFromTokens.java | 44 +-
.../search/matchhighlight/OffsetsFromValues.java | 22 +-
.../matchhighlight/OffsetsRetrievalStrategy.java | 18 +-
.../OffsetsRetrievalStrategySupplier.java | 8 +-
.../lucene/search/matchhighlight/Passage.java | 5 +-
.../search/matchhighlight/PassageAdjuster.java | 6 +-
.../search/matchhighlight/PassageFormatter.java | 31 +-
.../search/matchhighlight/PassageSelector.java | 27 +-
.../lucene/search/matchhighlight/package-info.java | 14 +-
.../search/uhighlight/AnalysisOffsetStrategy.java | 93 +-
.../lucene/search/uhighlight/CharArrayMatcher.java | 10 +-
.../uhighlight/CustomSeparatorBreakIterator.java | 9 +-
.../search/uhighlight/DefaultPassageFormatter.java | 27 +-
.../lucene/search/uhighlight/FieldHighlighter.java | 88 +-
.../search/uhighlight/FieldOffsetStrategy.java | 93 +-
.../uhighlight/LabelledCharArrayMatcher.java | 72 +-
.../search/uhighlight/LengthGoalBreakIterator.java | 83 +-
.../uhighlight/MemoryIndexOffsetStrategy.java | 48 +-
.../search/uhighlight/MultiTermHighlighting.java | 20 +-
.../search/uhighlight/NoOpOffsetStrategy.java | 17 +-
.../lucene/search/uhighlight/OffsetsEnum.java | 156 +-
.../OverlaySingleDocTermsLeafReader.java | 13 +-
.../apache/lucene/search/uhighlight/Passage.java | 70 +-
.../lucene/search/uhighlight/PassageFormatter.java | 18 +-
.../lucene/search/uhighlight/PassageScorer.java | 56 +-
.../lucene/search/uhighlight/PhraseHelper.java | 184 +-
.../search/uhighlight/PostingsOffsetStrategy.java | 7 +-
.../PostingsWithTermVectorsOffsetStrategy.java | 7 +-
.../search/uhighlight/SplittingBreakIterator.java | 39 +-
.../uhighlight/TermVectorFilteredLeafReader.java | 29 +-
.../uhighlight/TermVectorOffsetStrategy.java | 12 +-
.../uhighlight/TokenStreamOffsetStrategy.java | 17 +-
.../lucene/search/uhighlight/UHComponents.java | 26 +-
.../search/uhighlight/UnifiedHighlighter.java | 644 +++---
.../search/uhighlight/WholeBreakIterator.java | 6 +-
.../lucene/search/uhighlight/package-info.java | 4 +-
.../vectorhighlight/BaseFragListBuilder.java | 132 +-
.../vectorhighlight/BaseFragmentsBuilder.java | 314 +--
.../search/vectorhighlight/BoundaryScanner.java | 10 +-
.../BreakIteratorBoundaryScanner.java | 13 +-
.../vectorhighlight/FastVectorHighlighter.java | 261 ++-
.../search/vectorhighlight/FieldFragList.java | 100 +-
.../search/vectorhighlight/FieldPhraseList.java | 338 ++--
.../lucene/search/vectorhighlight/FieldQuery.java | 389 ++--
.../search/vectorhighlight/FieldTermStack.java | 134 +-
.../search/vectorhighlight/FragListBuilder.java | 8 +-
.../search/vectorhighlight/FragmentsBuilder.java | 63 +-
.../ScoreOrderFragmentsBuilder.java | 50 +-
.../vectorhighlight/SimpleBoundaryScanner.java | 46 +-
.../vectorhighlight/SimpleFieldFragList.java | 25 +-
.../vectorhighlight/SimpleFragListBuilder.java | 14 +-
.../vectorhighlight/SimpleFragmentsBuilder.java | 30 +-
.../vectorhighlight/SingleFragListBuilder.java | 25 +-
.../vectorhighlight/WeightedFieldFragList.java | 56 +-
.../vectorhighlight/WeightedFragListBuilder.java | 12 +-
.../search/vectorhighlight/package-info.java | 150 +-
.../lucene/search/highlight/TestHighlighter.java | 2100 +++++++++++---------
.../search/highlight/TestHighlighterPhrase.java | 259 +--
.../apache/lucene/search/highlight/TestMisses.java | 36 +-
.../highlight/TestOffsetLimitTokenFilter.java | 38 +-
.../lucene/search/highlight/TestTokenSources.java | 219 +-
.../highlight/custom/TestHighlightCustomQuery.java | 75 +-
.../search/matchhighlight/AnalyzerWithGaps.java | 6 +-
.../matchhighlight/AsciiMatchRangeHighlighter.java | 12 +-
.../lucene/search/matchhighlight/IndexBuilder.java | 22 +-
.../search/matchhighlight/MissingAnalyzer.java | 3 +-
.../matchhighlight/TestMatchHighlighter.java | 683 ++++---
.../matchhighlight/TestMatchRegionRetriever.java | 762 +++----
.../search/matchhighlight/TestPassageSelector.java | 11 +-
.../TestCustomSeparatorBreakIterator.java | 33 +-
.../uhighlight/TestDefaultPassageFormatter.java | 22 +-
.../uhighlight/TestLengthGoalBreakIterator.java | 228 ++-
.../uhighlight/TestSplittingBreakIterator.java | 66 +-
.../search/uhighlight/TestUnifiedHighlighter.java | 718 ++++---
.../uhighlight/TestUnifiedHighlighterMTQ.java | 431 ++--
.../uhighlight/TestUnifiedHighlighterRanking.java | 155 +-
.../TestUnifiedHighlighterReanalysis.java | 28 +-
.../TestUnifiedHighlighterStrictPhrases.java | 609 +++---
.../TestUnifiedHighlighterTermIntervals.java | 467 +++--
.../uhighlight/TestUnifiedHighlighterTermVec.java | 90 +-
.../search/uhighlight/TestWholeBreakIterator.java | 56 +-
.../lucene/search/uhighlight/UHTestHelper.java | 20 +-
.../TestUnifiedHighlighterExtensibility.java | 344 ++--
.../search/vectorhighlight/AbstractTestCase.java | 270 ++-
.../TestBreakIteratorBoundaryScanner.java | 25 +-
.../vectorhighlight/TestFastVectorHighlighter.java | 798 +++++---
.../vectorhighlight/TestFieldPhraseList.java | 352 ++--
.../search/vectorhighlight/TestFieldQuery.java | 1105 +++++-----
.../search/vectorhighlight/TestFieldTermStack.java | 254 +--
.../vectorhighlight/TestIndexTimeSynonym.java | 366 ++--
.../TestScoreOrderFragmentsBuilder.java | 26 +-
.../vectorhighlight/TestSimpleBoundaryScanner.java | 12 +-
.../vectorhighlight/TestSimpleFragListBuilder.java | 207 +-
.../TestSimpleFragmentsBuilder.java | 209 +-
.../vectorhighlight/TestSingleFragListBuilder.java | 45 +-
.../TestWeightedFragListBuilder.java | 39 +-
186 files changed, 11053 insertions(+), 9746 deletions(-)
diff --git a/gradle/validation/spotless.gradle b/gradle/validation/spotless.gradle
index 87fcdb6..8a06eef 100644
--- a/gradle/validation/spotless.gradle
+++ b/gradle/validation/spotless.gradle
@@ -33,7 +33,26 @@ allprojects { prj ->
googleJavaFormat('1.9')
switch (project.path) {
- // Disable for everything else for now.
+ // These modules are complete - all sources scanned.
+ case ":lucene:highlighter":
+ target "src/**"
+ targetExclude "**/resources/**", "**/CambridgeMA.utf8", "**/overview.html"
+ break
+
+ // Partially complete.
+ case ":lucene:core":
+ target "src/**/org/apache/lucene/analysis/standard/**",
+ "src/**/org/apache/lucene/analysis/tokenattributes/**"
+ targetExclude "**/resources/**", "**/StandardTokenizerImpl.jflex", "**/StandardTokenizerImpl.java"
+ break
+
+ case ":lucene:analysis:common":
+ target "src/**/org/apache/lucene/analysis/ar/**",
+ "src/**/org/apache/lucene/collation/**"
+ targetExclude "**/resources/**"
+ break
+
+ // All others - disable reformatting/checks for now.
default:
target 'non-existing/**'
break
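Assuming the standard Spotless plugin task names (the plugin registers per-project spotlessCheck and spotlessApply tasks), the modules enabled above can be verified or reformatted individually, for example:

gradlew :lucene:highlighter:spotlessCheck
gradlew :lucene:highlighter:spotlessApply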
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index 90a60a5..ce679b1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.IOException;
import java.io.Reader;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -32,19 +30,17 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
- * {@link Analyzer} for Arabic.
- * <p>
- * This analyzer implements light-stemming as specified by:
- * <i>
- * Light Stemming for Arabic Information Retrieval
- * </i>
- * http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
- * <p>
- * The analysis package contains three primary components:
+ * {@link Analyzer} for Arabic.
+ *
+ * <p>This analyzer implements light-stemming as specified by: <i> Light Stemming for Arabic
+ * Information Retrieval </i> http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
+ *
+ * <p>The analysis package contains three primary components:
+ *
* <ul>
- * <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
- * <li>{@link ArabicStemFilter}: Arabic light stemming
- * <li>Arabic stop words file: a set of default Arabic stop words.
+ * <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
+ * <li>{@link ArabicStemFilter}: Arabic light stemming
+ * <li>Arabic stop words file: a set of default Arabic stop words.
* </ul>
*
* @since 3.1
@@ -53,23 +49,24 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Arabic stopwords.
- *
- * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
- * The stopword list is BSD-Licensed.
+ *
+ * <p>Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html The
+ * stopword list is BSD-Licensed.
*/
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
+ *
* @return an unmodifiable instance of the default stop-words set.
*/
- public static CharArraySet getDefaultStopSet(){
+ public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
/**
- * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
- * accesses the static final set the first time.;
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+ * static final set the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
@@ -84,51 +81,43 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
}
}
}
-
+
private final CharArraySet stemExclusionSet;
- /**
- * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
- */
+ /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public ArabicAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
-
+
/**
* Builds an analyzer with the given stop words
- *
- * @param stopwords
- * a stopword set
+ *
+ * @param stopwords a stopword set
*/
- public ArabicAnalyzer(CharArraySet stopwords){
+ public ArabicAnalyzer(CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET);
}
/**
- * Builds an analyzer with the given stop word. If a none-empty stem exclusion set is
- * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
- * {@link ArabicStemFilter}.
- *
- * @param stopwords
- * a stopword set
- * @param stemExclusionSet
- * a set of terms not to be stemmed
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
+ * this analyzer will add a {@link SetKeywordMarkerFilter} before {@link ArabicStemFilter}.
+ *
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
*/
- public ArabicAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet){
+ public ArabicAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}
/**
- * Creates
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * used to tokenize all the text in the provided {@link Reader}.
- *
- * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from an {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
- * {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
- * if a stem exclusion set is provided and {@link ArabicStemFilter}.
+ * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
+ * the text in the provided {@link Reader}.
+ *
+ * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an {@link
+ * StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
+ * {@link StopFilter}, {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a
+ * stem exclusion set is provided and {@link ArabicStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
@@ -139,7 +128,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
result = new StopFilter(result, stopwords);
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
result = new ArabicNormalizationFilter(result);
- if(!stemExclusionSet.isEmpty()) {
+ if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new ArabicStemFilter(result));
@@ -153,4 +142,3 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
return result;
}
}
-
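A minimal usage sketch of the analyzer chain documented above, driving it through the public TokenStream API (the class name and field name here are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ArabicAnalyzerExample {
  public static void main(String[] args) throws IOException {
    // StandardTokenizer -> LowerCaseFilter -> DecimalDigitFilter -> StopFilter
    // -> ArabicNormalizationFilter -> ArabicStemFilter, per the javadoc above.
    try (Analyzer analyzer = new ArabicAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", "الكتاب")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // prints "كتاب": the definite article is stemmed away
      }
      ts.end();
    }
  }
}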
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
index ba94106..8ac1b0d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
@@ -16,22 +16,16 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-/**
- * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography.
- *
- */
-
+/** A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography. */
public final class ArabicNormalizationFilter extends TokenFilter {
private final ArabicNormalizer normalizer = new ArabicNormalizer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
+
public ArabicNormalizationFilter(TokenStream input) {
super(input);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java
index 5f604f5..937752e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link ArabicNormalizationFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -41,7 +40,7 @@ public class ArabicNormalizationFilterFactory extends TokenFilterFactory {
public static final String NAME = "arabicNormalization";
/** Creates a new ArabicNormalizationFilterFactory */
- public ArabicNormalizationFilterFactory(Map<String,String> args) {
+ public ArabicNormalizationFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
index 92a818d..bea82f6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
@@ -16,23 +16,22 @@
*/
package org.apache.lucene.analysis.ar;
-
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
- * Normalizer for Arabic.
- * <p>
- * Normalization is done in-place for efficiency, operating on a termbuffer.
- * <p>
- * Normalization is defined as:
- * <ul>
- * <li> Normalization of hamza with alef seat to a bare alef.
- * <li> Normalization of teh marbuta to heh
- * <li> Normalization of dotless yeh (alef maksura) to yeh.
- * <li> Removal of Arabic diacritics (the harakat)
- * <li> Removal of tatweel (stretching character).
- * </ul>
+ * Normalizer for Arabic.
+ *
+ * <p>Normalization is done in-place for efficiency, operating on a termbuffer.
*
+ * <p>Normalization is defined as:
+ *
+ * <ul>
+ * <li>Normalization of hamza with alef seat to a bare alef.
+ * <li>Normalization of teh marbuta to heh
+ * <li>Normalization of dotless yeh (alef maksura) to yeh.
+ * <li>Removal of Arabic diacritics (the harakat)
+ * <li>Removal of tatweel (stretching character).
+ * </ul>
*/
public class ArabicNormalizer {
public static final char ALEF = '\u0627';
@@ -59,7 +58,7 @@ public class ArabicNormalizer {
/**
* Normalize an input buffer of Arabic text
- *
+ *
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
@@ -68,31 +67,31 @@ public class ArabicNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
- case ALEF_MADDA:
- case ALEF_HAMZA_ABOVE:
- case ALEF_HAMZA_BELOW:
- s[i] = ALEF;
- break;
- case DOTLESS_YEH:
- s[i] = YEH;
- break;
- case TEH_MARBUTA:
- s[i] = HEH;
- break;
- case TATWEEL:
- case KASRATAN:
- case DAMMATAN:
- case FATHATAN:
- case FATHA:
- case DAMMA:
- case KASRA:
- case SHADDA:
- case SUKUN:
- len = delete(s, i, len);
- i--;
- break;
- default:
- break;
+ case ALEF_MADDA:
+ case ALEF_HAMZA_ABOVE:
+ case ALEF_HAMZA_BELOW:
+ s[i] = ALEF;
+ break;
+ case DOTLESS_YEH:
+ s[i] = YEH;
+ break;
+ case TEH_MARBUTA:
+ s[i] = HEH;
+ break;
+ case TATWEEL:
+ case KASRATAN:
+ case DAMMATAN:
+ case FATHATAN:
+ case FATHA:
+ case DAMMA:
+ case KASRA:
+ case SHADDA:
+ case SUKUN:
+ len = delete(s, i, len);
+ i--;
+ break;
+ default:
+ break;
}
}
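A short sketch of the in-place contract documented above: the term buffer is rewritten and the new logical length is returned (the sample word is illustrative):

import org.apache.lucene.analysis.ar.ArabicNormalizer;

public class ArabicNormalizerExample {
  public static void main(String[] args) {
    ArabicNormalizer normalizer = new ArabicNormalizer();
    char[] buf = "كِتَاب".toCharArray(); // carries kasra and fatha diacritics
    int len = normalizer.normalize(buf, buf.length); // diacritics deleted in place
    System.out.println(new String(buf, 0, len)); // prints "كتاب"
  }
}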
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
index f1d0fa3..9005ea9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
@@ -16,29 +16,27 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.IOException;
-
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.
- * <p>
- * To prevent terms from being stemmed use an instance of
- * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
- * the {@link KeywordAttribute} before this {@link TokenStream}.
- * </p>
- * @see SetKeywordMarkerFilter */
-
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
+ * @see SetKeywordMarkerFilter
+ */
public final class ArabicStemFilter extends TokenFilter {
private final ArabicStemmer stemmer = new ArabicStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
-
+
public ArabicStemFilter(TokenStream input) {
super(input);
}
@@ -46,7 +44,7 @@ public final class ArabicStemFilter extends TokenFilter {
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- if(!keywordAttr.isKeyword()) {
+ if (!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
index 96c225a..9f2e702 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link ArabicStemFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -42,7 +41,7 @@ public class ArabicStemFilterFactory extends TokenFilterFactory {
public static final String NAME = "arabicStem";
/** Creates a new ArabicStemFilterFactory */
- public ArabicStemFilterFactory(Map<String,String> args) {
+ public ArabicStemFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
index 6093317..5675bcf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
@@ -16,21 +16,19 @@
*/
package org.apache.lucene.analysis.ar;
-
-
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
- * Stemmer for Arabic.
- * <p>
- * Stemming is done in-place for efficiency, operating on a termbuffer.
- * <p>
- * Stemming is defined as:
- * <ul>
- * <li> Removal of attached definite article, conjunction, and prepositions.
- * <li> Stemming of common suffixes.
- * </ul>
+ * Stemmer for Arabic.
+ *
+ * <p>Stemming is done in-place for efficiency, operating on a termbuffer.
+ *
+ * <p>Stemming is defined as:
*
+ * <ul>
+ * <li>Removal of attached definite article, conjunction, and prepositions.
+ * <li>Stemming of common suffixes.
+ * </ul>
*/
public class ArabicStemmer {
public static final char ALEF = '\u0627';
@@ -44,33 +42,33 @@ public class ArabicStemmer {
public static final char HEH = '\u0647';
public static final char WAW = '\u0648';
public static final char YEH = '\u064A';
-
+
public static final char prefixes[][] = {
- ("" + ALEF + LAM).toCharArray(),
- ("" + WAW + ALEF + LAM).toCharArray(),
- ("" + BEH + ALEF + LAM).toCharArray(),
- ("" + KAF + ALEF + LAM).toCharArray(),
- ("" + FEH + ALEF + LAM).toCharArray(),
- ("" + LAM + LAM).toCharArray(),
- ("" + WAW).toCharArray(),
+ ("" + ALEF + LAM).toCharArray(),
+ ("" + WAW + ALEF + LAM).toCharArray(),
+ ("" + BEH + ALEF + LAM).toCharArray(),
+ ("" + KAF + ALEF + LAM).toCharArray(),
+ ("" + FEH + ALEF + LAM).toCharArray(),
+ ("" + LAM + LAM).toCharArray(),
+ ("" + WAW).toCharArray(),
};
-
+
public static final char suffixes[][] = {
- ("" + HEH + ALEF).toCharArray(),
- ("" + ALEF + NOON).toCharArray(),
- ("" + ALEF + TEH).toCharArray(),
- ("" + WAW + NOON).toCharArray(),
- ("" + YEH + NOON).toCharArray(),
+ ("" + HEH + ALEF).toCharArray(),
+ ("" + ALEF + NOON).toCharArray(),
+ ("" + ALEF + TEH).toCharArray(),
+ ("" + WAW + NOON).toCharArray(),
+ ("" + YEH + NOON).toCharArray(),
("" + YEH + HEH).toCharArray(),
("" + YEH + TEH_MARBUTA).toCharArray(),
("" + HEH).toCharArray(),
("" + TEH_MARBUTA).toCharArray(),
("" + YEH).toCharArray(),
-};
-
+ };
+
/**
* Stem an input buffer of Arabic text.
- *
+ *
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
@@ -78,38 +76,40 @@ public class ArabicStemmer {
public int stem(char s[], int len) {
len = stemPrefix(s, len);
len = stemSuffix(s, len);
-
+
return len;
}
-
+
/**
* Stem a prefix off an Arabic word.
+ *
* @param s input buffer
* @param len length of input buffer
* @return new length of input buffer after stemming.
*/
public int stemPrefix(char s[], int len) {
- for (int i = 0; i < prefixes.length; i++)
- if (startsWithCheckLength(s, len, prefixes[i]))
- return deleteN(s, 0, len, prefixes[i].length);
+ for (int i = 0; i < prefixes.length; i++)
+ if (startsWithCheckLength(s, len, prefixes[i])) return deleteN(s, 0, len, prefixes[i].length);
return len;
}
/**
* Stem suffix(es) off an Arabic word.
+ *
* @param s input buffer
* @param len length of input buffer
* @return new length of input buffer after stemming
*/
public int stemSuffix(char s[], int len) {
- for (int i = 0; i < suffixes.length; i++)
+ for (int i = 0; i < suffixes.length; i++)
if (endsWithCheckLength(s, len, suffixes[i]))
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
return len;
}
-
+
/**
* Returns true if the prefix matches and can be stemmed
+ *
* @param s input buffer
* @param len length of input buffer
* @param prefix prefix to check
@@ -121,16 +121,19 @@ public class ArabicStemmer {
} else if (len < prefix.length + 2) { // other prefixes require only 2.
return false;
} else {
- for (int i = 0; i < prefix.length; i++)
- if (s[i] != prefix[i])
+ for (int i = 0; i < prefix.length; i++) {
+ if (s[i] != prefix[i]) {
return false;
-
+ }
+ }
+
return true;
}
}
-
+
/**
* Returns true if the suffix matches and can be stemmed
+ *
* @param s input buffer
* @param len length of input buffer
* @param suffix suffix to check
@@ -140,11 +143,13 @@ public class ArabicStemmer {
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
return false;
} else {
- for (int i = 0; i < suffix.length; i++)
- if (s[len - suffix.length + i] != suffix[i])
+ for (int i = 0; i < suffix.length; i++) {
+ if (s[len - suffix.length + i] != suffix[i]) {
return false;
-
+ }
+ }
+
return true;
}
- }
+ }
}
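The stemmer follows the same buffer contract; as stem() above shows, the attached prefix is stripped first and suffixes second. A minimal sketch (the sample word is illustrative):

import org.apache.lucene.analysis.ar.ArabicStemmer;

public class ArabicStemmerExample {
  public static void main(String[] args) {
    ArabicStemmer stemmer = new ArabicStemmer();
    char[] buf = "الكتاب".toCharArray(); // definite article (alef + lam) attached
    int len = stemmer.stem(buf, buf.length); // stemPrefix, then stemSuffix, in place
    System.out.println(new String(buf, 0, len)); // prints "كتاب"
  }
}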
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/package-info.java
index dfb2cab..3d2148b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Analyzer for Arabic.
- */
-package org.apache.lucene.analysis.ar;
\ No newline at end of file
+/** Analyzer for Arabic. */
+package org.apache.lucene.analysis.ar;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java
index 6de59c8..3058b68 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java
@@ -16,73 +16,59 @@
*/
package org.apache.lucene.collation;
-
import java.text.Collator;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.collation.tokenattributes.CollatedTermAttributeImpl;
import org.apache.lucene.util.AttributeFactory;
/**
- * <p>
- * Converts each token into its {@link java.text.CollationKey}, and then
- * encodes the bytes as an index term.
- * </p>
- * <p>
- * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
- * index and query time -- CollationKeys are only comparable when produced by
- * the same Collator. Since {@link java.text.RuleBasedCollator}s are not
- * independently versioned, it is unsafe to search against stored
- * CollationKeys unless the following are exactly the same (best practice is
- * to store this information with the index and check that they remain the
- * same at query time):
- * </p>
+ * Converts each token into its {@link java.text.CollationKey}, and then encodes the bytes as an
+ * index term.
+ *
+ * <p><strong>WARNING:</strong> Make sure you use exactly the same Collator at index and query time
+ * -- CollationKeys are only comparable when produced by the same Collator. Since {@link
+ * java.text.RuleBasedCollator}s are not independently versioned, it is unsafe to search against
+ * stored CollationKeys unless the following are exactly the same (best practice is to store this
+ * information with the index and check that they remain the same at query time):
+ *
* <ol>
- * <li>JVM vendor</li>
- * <li>JVM version, including patch version</li>
- * <li>
- * The language (and country and variant, if specified) of the Locale
- * used when constructing the collator via
- * {@link Collator#getInstance(java.util.Locale)}.
- * </li>
- * <li>
- * The collation strength used - see {@link Collator#setStrength(int)}
- * </li>
- * </ol>
- * <p>
- * The <code>ICUCollationAttributeFactory</code> in the analysis-icu package
- * uses ICU4J's Collator, which makes its
- * version available, thus allowing collation to be versioned independently
- * from the JVM. ICUCollationAttributeFactory is also significantly faster and
- * generates significantly shorter keys than CollationAttributeFactory. See
- * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
- * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
- * generation timing and key length comparisons between ICU4J and
- * java.text.Collator over several languages.
- * </p>
- * <p>
- * CollationKeys generated by java.text.Collators are not compatible
- * with those those generated by ICU Collators. Specifically, if you use
- * CollationAttributeFactory to generate index terms, do not use
- * ICUCollationAttributeFactory on the query side, or vice versa.
- * </p>
+ * <li>JVM vendor
+ * <li>JVM version, including patch version
+ * <li>The language (and country and variant, if specified) of the Locale used when constructing
+ * the collator via {@link Collator#getInstance(java.util.Locale)}.
+ * <li>The collation strength used - see {@link Collator#setStrength(int)}
+ * </ol>
+ *
+ * <p>The <code>ICUCollationAttributeFactory</code> in the analysis-icu package uses ICU4J's
+ * Collator, which makes its version available, thus allowing collation to be versioned
+ * independently from the JVM. ICUCollationAttributeFactory is also significantly faster and
+ * generates significantly shorter keys than CollationAttributeFactory. See <a
+ * href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key generation timing and key
+ * length comparisons between ICU4J and java.text.Collator over several languages.
+ *
+ * <p>CollationKeys generated by java.text.Collators are not compatible with those generated
+ * by ICU Collators. Specifically, if you use CollationAttributeFactory to generate index terms, do
+ * not use ICUCollationAttributeFactory on the query side, or vice versa.
*/
-public class CollationAttributeFactory extends AttributeFactory.StaticImplementationAttributeFactory<CollatedTermAttributeImpl> {
+public class CollationAttributeFactory
+ extends AttributeFactory.StaticImplementationAttributeFactory<CollatedTermAttributeImpl> {
private final Collator collator;
-
+
/**
- * Create a CollationAttributeFactory, using
- * {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY} as the
- * factory for all other attributes.
+ * Create a CollationAttributeFactory, using {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}
+ * as the factory for all other attributes.
+ *
* @param collator CollationKey generator
*/
public CollationAttributeFactory(Collator collator) {
this(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, collator);
}
-
+
/**
- * Create a CollationAttributeFactory, using the supplied Attribute Factory
- * as the factory for all other attributes.
+ * Create a CollationAttributeFactory, using the supplied Attribute Factory as the factory for all
+ * other attributes.
+ *
* @param delegate Attribute Factory
* @param collator CollationKey generator
*/
@@ -90,7 +76,7 @@ public class CollationAttributeFactory extends AttributeFactory.StaticImplementa
super(delegate, CollatedTermAttributeImpl.class);
this.collator = collator;
}
-
+
@Override
public CollatedTermAttributeImpl createInstance() {
return new CollatedTermAttributeImpl(collator);
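A minimal sketch of plugging the factory into a tokenizer, mirroring what CollationKeyAnalyzer does internally (the locale choice is illustrative):

import java.io.IOException;
import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.CollationAttributeFactory;

public class CollationFactoryExample {
  public static void main(String[] args) throws IOException {
    Collator collator = Collator.getInstance(new Locale("fr", "FR"));
    CollationAttributeFactory factory = new CollationAttributeFactory(collator);
    // Term bytes produced through this factory are CollationKeys, not UTF-8:
    KeywordTokenizer tokenizer =
        new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
    tokenizer.close(); // normally an Analyzer manages the tokenizer lifecycle
  }
}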
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationDocValuesField.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationDocValuesField.java
index 507abfd..f34285f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationDocValuesField.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationDocValuesField.java
@@ -16,38 +16,35 @@
*/
package org.apache.lucene.collation;
-
import java.text.Collator;
-
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.util.BytesRef;
/**
* Indexes collation keys as a single-valued {@link SortedDocValuesField}.
- * <p>
- * This is more efficient that {@link CollationKeyAnalyzer} if the field
- * only has one value: no uninversion is necessary to sort on the field,
- * locale-sensitive range queries can still work via {@code DocValuesRangeQuery},
- * and the underlying data structures built at index-time are likely more efficient
- * and use less memory than FieldCache.
+ *
+ * <p>This is more efficient than {@link CollationKeyAnalyzer} if the field only has one value: no
+ * uninversion is necessary to sort on the field, locale-sensitive range queries can still work via
+ * {@code DocValuesRangeQuery}, and the underlying data structures built at index-time are likely
+ * more efficient and use less memory than FieldCache.
*/
public final class CollationDocValuesField extends Field {
private final String name;
private final Collator collator;
private final BytesRef bytes = new BytesRef();
-
+
/**
* Create a new CollationDocValuesField.
- * <p>
- * NOTE: you should not create a new one for each document, instead
- * just make one and reuse it during your indexing process, setting
- * the value via {@link #setStringValue(String)}.
+ *
+ * <p>NOTE: you should not create a new one for each document, instead just make one and reuse it
+ * during your indexing process, setting the value via {@link #setStringValue(String)}.
+ *
* @param name field name
* @param collator Collator for generating collation keys.
*/
// TODO: can we make this trap-free? maybe just synchronize on the collator
- // instead?
+ // instead?
public CollationDocValuesField(String name, Collator collator) {
super(name, SortedDocValuesField.TYPE);
this.name = name;
@@ -59,7 +56,7 @@ public final class CollationDocValuesField extends Field {
public String name() {
return name;
}
-
+
@Override
public void setStringValue(String value) {
bytes.bytes = collator.getCollationKey(value).toByteArray();
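Per the reuse note in the javadoc above, a sketch that creates the field once and resets its value per document (field name, locale, and the elided IndexWriter plumbing are illustrative):

import java.text.Collator;
import java.util.Locale;
import org.apache.lucene.collation.CollationDocValuesField;
import org.apache.lucene.document.Document;

public class CollationFieldExample {
  public static void main(String[] args) {
    Collator collator = Collator.getInstance(new Locale("da", "DK"));
    CollationDocValuesField sortKey = new CollationDocValuesField("name_sort", collator);
    Document doc = new Document();
    doc.add(sortKey); // add once, reuse across documents
    for (String name : new String[] {"Øberg", "Andersen"}) {
      sortKey.setStringValue(name); // collation key recomputed here
      // writer.addDocument(doc); // hypothetical IndexWriter would index it here
    }
  }
}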
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
index d2099fb..fa8d6ba 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
@@ -16,67 +16,48 @@
*/
package org.apache.lucene.collation;
-
-
+import java.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.AttributeFactory;
-import java.text.Collator;
-
/**
- * <p>
- * Configures {@link KeywordTokenizer} with {@link CollationAttributeFactory}.
- * </p>
- * <p>
- * Converts the token into its {@link java.text.CollationKey}, and then
- * encodes the CollationKey directly to allow
- * it to be stored as an index term.
- * </p>
- * <p>
- * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
- * index and query time -- CollationKeys are only comparable when produced by
- * the same Collator. Since {@link java.text.RuleBasedCollator}s are not
- * independently versioned, it is unsafe to search against stored
- * CollationKeys unless the following are exactly the same (best practice is
- * to store this information with the index and check that they remain the
- * same at query time):
- * </p>
+ * Configures {@link KeywordTokenizer} with {@link CollationAttributeFactory}.
+ *
+ * <p>Converts the token into its {@link java.text.CollationKey}, and then encodes the CollationKey
+ * directly to allow it to be stored as an index term.
+ *
+ * <p><strong>WARNING:</strong> Make sure you use exactly the same Collator at index and query time
+ * -- CollationKeys are only comparable when produced by the same Collator. Since {@link
+ * java.text.RuleBasedCollator}s are not independently versioned, it is unsafe to search against
+ * stored CollationKeys unless the following are exactly the same (best practice is to store this
+ * information with the index and check that they remain the same at query time):
+ *
* <ol>
- * <li>JVM vendor</li>
- * <li>JVM version, including patch version</li>
- * <li>
- * The language (and country and variant, if specified) of the Locale
- * used when constructing the collator via
- * {@link Collator#getInstance(java.util.Locale)}.
- * </li>
- * <li>
- * The collation strength used - see {@link Collator#setStrength(int)}
- * </li>
- * </ol>
- * <p>
- * The <code>ICUCollationKeyAnalyzer</code> in the analysis-icu package
- * uses ICU4J's Collator, which makes
- * its version available, thus allowing collation to be versioned
- * independently from the JVM. ICUCollationKeyAnalyzer is also significantly
- * faster and generates significantly shorter keys than CollationKeyAnalyzer.
- * See <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
- * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
- * generation timing and key length comparisons between ICU4J and
- * java.text.Collator over several languages.
- * </p>
- * <p>
- * CollationKeys generated by java.text.Collators are not compatible
- * with those those generated by ICU Collators. Specifically, if you use
- * CollationKeyAnalyzer to generate index terms, do not use
- * ICUCollationKeyAnalyzer on the query side, or vice versa.
- * </p>
+ * <li>JVM vendor
+ * <li>JVM version, including patch version
+ * <li>The language (and country and variant, if specified) of the Locale used when constructing
+ * the collator via {@link Collator#getInstance(java.util.Locale)}.
+ * <li>The collation strength used - see {@link Collator#setStrength(int)}
+ * </ol>
+ *
+ * <p>The <code>ICUCollationKeyAnalyzer</code> in the analysis-icu package uses ICU4J's Collator,
+ * which makes its version available, thus allowing collation to be versioned independently from the
+ * JVM. ICUCollationKeyAnalyzer is also significantly faster and generates significantly shorter
+ * keys than CollationKeyAnalyzer. See <a
+ * href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key generation timing and key
+ * length comparisons between ICU4J and java.text.Collator over several languages.
+ *
+ * <p>CollationKeys generated by java.text.Collators are not compatible with those generated
+ * by ICU Collators. Specifically, if you use CollationKeyAnalyzer to generate index terms, do not
+ * use ICUCollationKeyAnalyzer on the query side, or vice versa.
*
* @since 3.1
*/
public final class CollationKeyAnalyzer extends Analyzer {
private final CollationAttributeFactory factory;
-
+
/**
* Create a new CollationKeyAnalyzer, using the specified collator.
*
@@ -93,7 +74,8 @@ public final class CollationKeyAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
- KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ KeywordTokenizer tokenizer =
+ new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
index c79b58c..f9c0ffc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/package-info.java
@@ -16,35 +16,28 @@
*/
/**
- * Unicode collation support.
- * <p>
- * <code>Collation</code> converts each token into its binary <code>CollationKey</code>
- * using the provided <code>Collator</code>, allowing it to be stored as an index term.
- * </p>
- *
+ * Unicode collation support.
+ *
+ * <p><code>Collation</code> converts each token into its binary <code>CollationKey</code> using the
+ * provided <code>Collator</code>, allowing it to be stored as an index term.
+ *
* <h2>Use Cases</h2>
- *
+ *
* <ul>
- * <li>
- * Efficient sorting of terms in languages that use non-Unicode character
- * orderings. (Lucene Sort using a Locale can be very slow.)
- * </li>
- * <li>
- * Efficient range queries over fields that contain terms in languages that
- * use non-Unicode character orderings. (Range queries using a Locale can be
- * very slow.)
- * </li>
- * <li>
- * Effective Locale-specific normalization (case differences, diacritics, etc.).
- * ({@link org.apache.lucene.analysis.LowerCaseFilter} and
- * {@link org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter} provide these services
- * in a generic way that doesn't take into account locale-specific needs.)
- * </li>
+ * <li>Efficient sorting of terms in languages that use non-Unicode character orderings. (Lucene
+ * Sort using a Locale can be very slow.)
+ * <li>Efficient range queries over fields that contain terms in languages that use non-Unicode
+ * character orderings. (Range queries using a Locale can be very slow.)
+ * <li>Effective Locale-specific normalization (case differences, diacritics, etc.). ({@link
+ * org.apache.lucene.analysis.LowerCaseFilter} and {@link
+ * org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter} provide these services in a
+ * generic way that doesn't take into account locale-specific needs.)
* </ul>
- *
+ *
* <h2>Example Usages</h2>
- *
+ *
* <h3>Farsi Range Queries</h3>
+ *
* <pre class="prettyprint">
* // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
* Collator collator = Collator.getInstance(new Locale("ar"));
@@ -58,10 +51,10 @@
* writer.close();
* IndexReader ir = DirectoryReader.open(dir);
* IndexSearcher is = new IndexSearcher(ir);
- *
+ *
* QueryParser aqp = new QueryParser("content", analyzer);
* aqp.setAnalyzeRangeTerms(true);
- *
+ *
* // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
* // orders the U+0698 character before the U+0633 character, so the single
* // indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -71,10 +64,11 @@
* = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
* assertEquals("The index Term should not be included.", 0, result.length);
* </pre>
- *
+ *
* <h3>Danish Sorting</h3>
+ *
* <pre class="prettyprint">
- * Analyzer analyzer
+ * Analyzer analyzer
* = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
* Path dirPath = Files.createTempDirectory("tempIndex");
* Directory dir = FSDirectory.open(dirPath);
@@ -100,8 +94,9 @@
* assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
* }
* </pre>
- *
+ *
* <h3>Turkish Case Normalization</h3>
+ *
* <pre class="prettyprint">
* Collator collator = Collator.getInstance(new Locale("tr", "TR"));
* collator.setStrength(Collator.PRIMARY);
@@ -120,47 +115,36 @@
* ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
* assertEquals("The index Term should be included.", 1, result.length);
* </pre>
- *
+ *
* <h2>Caveats and Comparisons</h2>
- * <p>
- * <strong>WARNING:</strong> Make sure you use exactly the same
- * <code>Collator</code> at index and query time -- <code>CollationKey</code>s
- * are only comparable when produced by
- * the same <code>Collator</code>. Since {@link java.text.RuleBasedCollator}s
- * are not independently versioned, it is unsafe to search against stored
- * <code>CollationKey</code>s unless the following are exactly the same (best
- * practice is to store this information with the index and check that they
- * remain the same at query time):
- * </p>
+ *
+ * <p><strong>WARNING:</strong> Make sure you use exactly the same <code>Collator</code> at index
+ * and query time -- <code>CollationKey</code>s are only comparable when produced by the same <code>
+ * Collator</code>. Since {@link java.text.RuleBasedCollator}s are not independently versioned, it
+ * is unsafe to search against stored <code>CollationKey</code>s unless the following are exactly
+ * the same (best practice is to store this information with the index and check that they remain
+ * the same at query time):
+ *
* <ol>
- * <li>JVM vendor</li>
- * <li>JVM version, including patch version</li>
- * <li>
- * The language (and country and variant, if specified) of the Locale
- * used when constructing the collator via
- * {@link java.text.Collator#getInstance(java.util.Locale)}.
- * </li>
- * <li>
- * The collation strength used - see {@link java.text.Collator#setStrength(int)}
- * </li>
- * </ol>
- * <p>
- * <code>ICUCollationKeyAnalyzer</code>, available in the <a href="{@docRoot}/../icu/overview-summary.html">icu analysis module</a>,
- * uses ICU4J's <code>Collator</code>, which
- * makes its version available, thus allowing collation to be versioned
- * independently from the JVM. <code>ICUCollationKeyAnalyzer</code> is also
- * significantly faster and generates significantly shorter keys than
- * <code>CollationKeyAnalyzer</code>. See
- * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
- * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
- * generation timing and key length comparisons between ICU4J and
- * <code>java.text.Collator</code> over several languages.
- * </p>
- * <p>
- * <code>CollationKey</code>s generated by <code>java.text.Collator</code>s are
- * not compatible with those those generated by ICU Collators. Specifically, if
- * you use <code>CollationKeyAnalyzer</code> to generate index terms, do not use
- * <code>ICUCollationKeyAnalyzer</code> on the query side, or vice versa.
- * </p>
+ * <li>JVM vendor
+ * <li>JVM version, including patch version
+ * <li>The language (and country and variant, if specified) of the Locale used when constructing
+ * the collator via {@link java.text.Collator#getInstance(java.util.Locale)}.
+ * <li>The collation strength used - see {@link java.text.Collator#setStrength(int)}
+ * </ol>
+ *
+ * <p><code>ICUCollationKeyAnalyzer</code>, available in the <a
+ * href="{@docRoot}/../icu/overview-summary.html">icu analysis module</a>, uses ICU4J's <code>
+ * Collator</code>, which makes its version available, thus allowing collation to be versioned
+ * independently from the JVM. <code>ICUCollationKeyAnalyzer</code> is also significantly faster and
+ * generates significantly shorter keys than <code>CollationKeyAnalyzer</code>. See <a
+ * href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key generation timing and key
+ * length comparisons between ICU4J and <code>java.text.Collator</code> over several languages.
+ *
+ * <p><code>CollationKey</code>s generated by <code>java.text.Collator</code>s are not compatible
+ * with those generated by ICU Collators. Specifically, if you use <code>CollationKeyAnalyzer
+ * </code> to generate index terms, do not use <code>ICUCollationKeyAnalyzer</code> on the query
+ * side, or vice versa.
*/
package org.apache.lucene.collation;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
index d997505..aa3b082 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
@@ -16,21 +16,20 @@
*/
package org.apache.lucene.collation.tokenattributes;
-
import java.text.Collator;
-
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.util.BytesRef;
/**
- * Extension of {@link CharTermAttributeImpl} that encodes the term
- * text as a binary Unicode collation key instead of as UTF-8 bytes.
+ * Extension of {@link CharTermAttributeImpl} that encodes the term text as a binary Unicode
+ * collation key instead of as UTF-8 bytes.
*/
public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
private final Collator collator;
/**
* Create a new CollatedTermAttributeImpl
+ *
* @param collator Collation key generator
*/
public CollatedTermAttributeImpl(Collator collator) {
@@ -38,7 +37,7 @@ public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
// or to reduce contention in case they do
this.collator = (Collator) collator.clone();
}
-
+
@Override
public BytesRef getBytesRef() {
final BytesRef ref = this.builder.get();
@@ -47,5 +46,4 @@ public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
ref.length = ref.bytes.length;
return ref;
}
-
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
index 2ace0ee..a4591d0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
@@ -16,99 +16,85 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
-/**
- * Test the Arabic Analyzer
- *
- */
+/** Test the Arabic Analyzer */
public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
-
- /** This test fails with NPE when the
- * stopwords file is missing in classpath */
+
+ /** This test fails with NPE when the stopwords file is missing from the classpath */
public void testResourcesAvailable() {
new ArabicAnalyzer().close();
}
-
+
/**
* Some simple tests showing some features of the analyzer, how some regular forms will conflate
*/
public void testBasicFeatures() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
- assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
- assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
-
- assertAnalyzesTo(a, "مشروب", new String[] { "مشروب" });
- assertAnalyzesTo(a, "مشروبات", new String[] { "مشروب" }); // plural -at
-
- assertAnalyzesTo(a, "أمريكيين", new String[] { "امريك" }); // plural -in
- assertAnalyzesTo(a, "امريكي", new String[] { "امريك" }); // singular with bare alif
-
- assertAnalyzesTo(a, "كتاب", new String[] { "كتاب" });
- assertAnalyzesTo(a, "الكتاب", new String[] { "كتاب" }); // definite article
-
- assertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] { "ملكت", "ايمانكم"});
- assertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" }); // stopwords
+ assertAnalyzesTo(a, "كبير", new String[] {"كبير"});
+ assertAnalyzesTo(a, "كبيرة", new String[] {"كبير"}); // feminine marker
+
+ assertAnalyzesTo(a, "مشروب", new String[] {"مشروب"});
+ assertAnalyzesTo(a, "مشروبات", new String[] {"مشروب"}); // plural -at
+
+ assertAnalyzesTo(a, "أمريكيين", new String[] {"امريك"}); // plural -in
+ assertAnalyzesTo(a, "امريكي", new String[] {"امريك"}); // singular with bare alif
+
+ assertAnalyzesTo(a, "كتاب", new String[] {"كتاب"});
+ assertAnalyzesTo(a, "الكتاب", new String[] {"كتاب"}); // definite article
+
+ assertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] {"ملكت", "ايمانكم"});
+ assertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] {"ملكت", "ايمانكم"}); // stopwords
a.close();
}
-
- /**
- * Simple tests to show things are getting reset correctly, etc.
- */
+
+ /** Simple tests to show things are getting reset correctly, etc. */
public void testReusableTokenStream() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
- assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
- assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
+ assertAnalyzesTo(a, "كبير", new String[] {"كبير"});
+ assertAnalyzesTo(a, "كبيرة", new String[] {"كبير"}); // feminine marker
a.close();
}
- /**
- * Non-arabic text gets treated in a similar way as SimpleAnalyzer.
- */
+  /** Non-Arabic text is treated the same way as it would be by SimpleAnalyzer. */
public void testEnglishInput() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
- assertAnalyzesTo(a, "English text.", new String[] {
- "english", "text" });
+ assertAnalyzesTo(a, "English text.", new String[] {"english", "text"});
a.close();
}
-
- /**
- * Test that custom stopwords work, and are not case-sensitive.
- */
+
+ /** Test that custom stopwords work, and are not case-sensitive. */
public void testCustomStopwords() throws Exception {
CharArraySet set = new CharArraySet(asSet("the", "and", "a"), false);
ArabicAnalyzer a = new ArabicAnalyzer(set);
- assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
- "brown", "fox" });
+ assertAnalyzesTo(a, "The quick brown fox.", new String[] {"quick", "brown", "fox"});
a.close();
}
-
+
public void testWithStemExclusionSet() throws IOException {
CharArraySet set = new CharArraySet(asSet("ساهدهات"), false);
ArabicAnalyzer a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, set);
- assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
- assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
+ assertAnalyzesTo(
+ a, "كبيرة the quick ساهدهات", new String[] {"كبير", "the", "quick", "ساهدهات"});
+ assertAnalyzesTo(
+ a, "كبيرة the quick ساهدهات", new String[] {"كبير", "the", "quick", "ساهدهات"});
a.close();
-
+
a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
- assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
- assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
+ assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] {"كبير", "the", "quick", "ساهد"});
+ assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] {"كبير", "the", "quick", "ساهد"});
a.close();
}
-
- /**
- * test we fold digits to latin-1
- */
+
+  /** Test that we fold Arabic-Indic digits to Latin digits */
public void testDigits() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
checkOneTerm(a, "١٢٣٤", "1234");
a.close();
}
-
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java
index f3f833d8..246210e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java
@@ -16,32 +16,24 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.Reader;
import java.io.StringReader;
-
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
-/**
- * Simple tests to ensure the Arabic filter Factories are working.
- */
+/** Simple tests to ensure the Arabic filter Factories are working. */
public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
-
- /**
- * Test ArabicNormalizationFilterFactory
- */
+
+ /** Test ArabicNormalizationFilterFactory */
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
TokenStream stream = tokenFilterFactory("ArabicNormalization").create(tokenizer);
assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
}
-
- /**
- * Test ArabicStemFilterFactory
- */
+
+ /** Test ArabicStemFilterFactory */
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
@@ -49,31 +41,38 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
stream = tokenFilterFactory("ArabicStem").create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
}
-
- /**
- * Test PersianCharFilterFactory
- */
+
+ /** Test PersianCharFilterFactory */
public void testPersianCharFilter() throws Exception {
Reader reader = charFilterFactory("Persian").create(new StringReader("میخورد"));
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
- assertTokenStreamContents(tokenizer, new String[] { "می", "خورد" });
+ assertTokenStreamContents(tokenizer, new String[] {"می", "خورد"});
}
-
+
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
- IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
- tokenFilterFactory("ArabicNormalization", "bogusArg", "bogusValue");
- });
+ IllegalArgumentException expected =
+ expectThrows(
+ IllegalArgumentException.class,
+ () -> {
+ tokenFilterFactory("ArabicNormalization", "bogusArg", "bogusValue");
+ });
assertTrue(expected.getMessage().contains("Unknown parameters"));
-
- expected = expectThrows(IllegalArgumentException.class, () -> {
- tokenFilterFactory("Arabicstem", "bogusArg", "bogusValue");
- });
+
+ expected =
+ expectThrows(
+ IllegalArgumentException.class,
+ () -> {
+ tokenFilterFactory("Arabicstem", "bogusArg", "bogusValue");
+ });
assertTrue(expected.getMessage().contains("Unknown parameters"));
-
- expected = expectThrows(IllegalArgumentException.class, () -> {
- charFilterFactory("Persian", "bogusArg", "bogusValue");
- });
+
+ expected =
+ expectThrows(
+ IllegalArgumentException.class,
+ () -> {
+ charFilterFactory("Persian", "bogusArg", "bogusValue");
+ });
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
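
Outside the test harness, the same factories can be composed with
CustomAnalyzer. A hedged sketch; the SPI names "arabicNormalization" and
"arabicStem" are assumptions inferred from the factory classes above:

    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    public class ArabicChainSketch {
      public static void main(String[] args) throws Exception {
        CustomAnalyzer analyzer =
            CustomAnalyzer.builder()
                .withTokenizer("standard")
                .addTokenFilter("arabicNormalization")
                .addTokenFilter("arabicStem")
                .build();
        // analyzer now normalizes and stems Arabic input, as in the tests above.
        analyzer.close();
      }
    }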
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
index a040901..d833b30 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
@@ -16,33 +16,29 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.IOException;
import java.io.StringReader;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-/**
- * Test the Arabic Normalization Filter
- */
+/** Test the Arabic Normalization Filter */
public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
public void testAlifMadda() throws IOException {
check("آجن", "اجن");
}
-
+
public void testAlifHamzaAbove() throws IOException {
check("أحمد", "احمد");
}
-
+
public void testAlifHamzaBelow() throws IOException {
check("إعاذ", "اعاذ");
}
-
+
public void testAlifMaksura() throws IOException {
check("بنى", "بني");
}
@@ -50,60 +46,60 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
public void testTehMarbuta() throws IOException {
check("فاطمة", "فاطمه");
}
-
+
public void testTatweel() throws IOException {
check("روبرـــــت", "روبرت");
}
-
+
public void testFatha() throws IOException {
check("مَبنا", "مبنا");
}
-
+
public void testKasra() throws IOException {
check("علِي", "علي");
}
-
+
public void testDamma() throws IOException {
check("بُوات", "بوات");
}
-
+
public void testFathatan() throws IOException {
check("ولداً", "ولدا");
}
-
+
public void testKasratan() throws IOException {
check("ولدٍ", "ولد");
}
-
+
public void testDammatan() throws IOException {
check("ولدٌ", "ولد");
- }
-
+ }
+
public void testSukun() throws IOException {
check("نلْسون", "نلسون");
}
-
+
public void testShaddah() throws IOException {
check("هتميّ", "هتمي");
- }
-
+ }
+
private void check(final String input, final String expected) throws IOException {
MockTokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenStream.setReader(new StringReader(input));
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
- assertTokenStreamContents(filter, new String[]{expected});
+ assertTokenStreamContents(filter, new String[] {expected});
}
-
+
public void testEmptyTerm() throws IOException {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new ArabicNormalizationFilter(tokenizer));
- }
- };
+ Analyzer a =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new ArabicNormalizationFilter(tokenizer));
+ }
+ };
checkOneTerm(a, "", "");
a.close();
}
-
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
index 872e7f5..403a1af 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
@@ -16,9 +16,7 @@
*/
package org.apache.lucene.analysis.ar;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
@@ -27,88 +25,85 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-/**
- * Test the Arabic Normalization Filter
- *
- */
+/** Test the Arabic Stem Filter */
public class TestArabicStemFilter extends BaseTokenStreamTestCase {
-
+
public void testAlPrefix() throws IOException {
check("الحسن", "حسن");
- }
+ }
public void testWalPrefix() throws IOException {
check("والحسن", "حسن");
- }
-
+ }
+
public void testBalPrefix() throws IOException {
check("بالحسن", "حسن");
- }
-
+ }
+
public void testKalPrefix() throws IOException {
check("كالحسن", "حسن");
- }
-
+ }
+
public void testFalPrefix() throws IOException {
check("فالحسن", "حسن");
- }
+ }
public void testLlPrefix() throws IOException {
- check("للاخر", "اخر");
+ check("للاخر", "اخر");
}
-
+
public void testWaPrefix() throws IOException {
check("وحسن", "حسن");
- }
-
+ }
+
public void testAhSuffix() throws IOException {
check("زوجها", "زوج");
- }
-
+ }
+
public void testAnSuffix() throws IOException {
check("ساهدان", "ساهد");
- }
-
+ }
+
public void testAtSuffix() throws IOException {
check("ساهدات", "ساهد");
- }
-
+ }
+
public void testWnSuffix() throws IOException {
check("ساهدون", "ساهد");
- }
-
+ }
+
public void testYnSuffix() throws IOException {
check("ساهدين", "ساهد");
- }
-
+ }
+
public void testYhSuffix() throws IOException {
check("ساهديه", "ساهد");
- }
+ }
public void testYpSuffix() throws IOException {
check("ساهدية", "ساهد");
- }
-
+ }
+
public void testHSuffix() throws IOException {
check("ساهده", "ساهد");
- }
-
+ }
+
public void testPSuffix() throws IOException {
check("ساهدة", "ساهد");
}
-
+
public void testYSuffix() throws IOException {
check("ساهدي", "ساهد");
}
-
+
public void testComboPrefSuf() throws IOException {
check("وساهدون", "ساهد");
}
-
+
public void testComboSuf() throws IOException {
check("ساهدهات", "ساهد");
}
-
+
public void testShouldntStem() throws IOException {
check("الو", "الو");
}
@@ -116,30 +111,31 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
public void testNonArabic() throws IOException {
check("English", "English");
}
-
+
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(1, true);
set.add("ساهدهات");
- MockTokenizer tokenStream = whitespaceMockTokenizer("ساهدهات");
+ MockTokenizer tokenStream = whitespaceMockTokenizer("ساهدهات");
ArabicStemFilter filter = new ArabicStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
- assertTokenStreamContents(filter, new String[]{"ساهدهات"});
+ assertTokenStreamContents(filter, new String[] {"ساهدهات"});
}
private void check(final String input, final String expected) throws IOException {
- MockTokenizer tokenStream = whitespaceMockTokenizer(input);
+ MockTokenizer tokenStream = whitespaceMockTokenizer(input);
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
- assertTokenStreamContents(filter, new String[]{expected});
+ assertTokenStreamContents(filter, new String[] {expected});
}
-
+
public void testEmptyTerm() throws IOException {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
- }
- };
+ Analyzer a =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
+ }
+ };
checkOneTerm(a, "", "");
a.close();
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationDocValuesField.java b/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationDocValuesField.java
index 50f3ab9..c62cee4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationDocValuesField.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationDocValuesField.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.collation;
-
import java.text.Collator;
import java.util.Locale;
-
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
@@ -37,42 +35,41 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
-/**
- * trivial test of CollationDocValuesField
- */
+/** Trivial test of CollationDocValuesField */
public class TestCollationDocValuesField extends LuceneTestCase {
-
+
public void testBasic() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field field = newField("field", "", StringField.TYPE_STORED);
- CollationDocValuesField collationField = new CollationDocValuesField("collated", Collator.getInstance(Locale.ENGLISH));
+ CollationDocValuesField collationField =
+ new CollationDocValuesField("collated", Collator.getInstance(Locale.ENGLISH));
doc.add(field);
doc.add(collationField);
field.setStringValue("ABC");
collationField.setStringValue("ABC");
iw.addDocument(doc);
-
+
field.setStringValue("abc");
collationField.setStringValue("abc");
iw.addDocument(doc);
-
+
IndexReader ir = iw.getReader();
iw.close();
-
+
IndexSearcher is = newSearcher(ir);
-
+
SortField sortField = new SortField("collated", SortField.Type.STRING);
-
+
TopDocs td = is.search(new MatchAllDocsQuery(), 5, new Sort(sortField));
assertEquals("abc", ir.document(td.scoreDocs[0].doc).get("field"));
assertEquals("ABC", ir.document(td.scoreDocs[1].doc).get("field"));
ir.close();
dir.close();
}
-
+
public void testRanges() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
@@ -85,7 +82,7 @@ public class TestCollationDocValuesField extends LuceneTestCase {
CollationDocValuesField collationField = new CollationDocValuesField("collated", collator);
doc.add(field);
doc.add(collationField);
-
+
int numDocs = atLeast(100);
for (int i = 0; i < numDocs; i++) {
String value = TestUtil.randomSimpleString(random());
@@ -93,13 +90,13 @@ public class TestCollationDocValuesField extends LuceneTestCase {
collationField.setStringValue(value);
iw.addDocument(doc);
}
-
+
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher is = newSearcher(ir);
-
+
int numChecks = atLeast(20);
-
+
try {
for (int i = 0; i < numChecks; i++) {
String start = TestUtil.randomSimpleString(random());
@@ -113,13 +110,21 @@ public class TestCollationDocValuesField extends LuceneTestCase {
dir.close();
}
}
-
- private void doTestRanges(IndexSearcher is, String startPoint, String endPoint, BytesRef startBR, BytesRef endBR, Collator collator) throws Exception {
+
+ private void doTestRanges(
+ IndexSearcher is,
+ String startPoint,
+ String endPoint,
+ BytesRef startBR,
+ BytesRef endBR,
+ Collator collator)
+ throws Exception {
SortedDocValues dvs = MultiDocValues.getSortedValues(is.getIndexReader(), "collated");
- for(int docID=0;docID<is.getIndexReader().maxDoc();docID++) {
+ for (int docID = 0; docID < is.getIndexReader().maxDoc(); docID++) {
Document doc = is.doc(docID);
String s = doc.getField("field").stringValue();
- boolean collatorAccepts = collate(collator, s, startPoint) >= 0 && collate(collator, s, endPoint) <= 0;
+ boolean collatorAccepts =
+ collate(collator, s, startPoint) >= 0 && collate(collator, s, endPoint) <= 0;
assertEquals(docID, dvs.nextDoc());
BytesRef br = dvs.binaryValue();
boolean luceneAccepts = br.compareTo(startBR) >= 0 && br.compareTo(endBR) <= 0;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
index b1a7fb2..efc3c2d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
@@ -16,56 +16,55 @@
*/
package org.apache.lucene.collation;
-
+import java.text.Collator;
+import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CollationTestBase;
import org.apache.lucene.util.BytesRef;
-import java.text.Collator;
-import java.util.Locale;
-
-public class TestCollationKeyAnalyzer extends CollationTestBase {
+public class TestCollationKeyAnalyzer extends CollationTestBase {
// Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer;
-
+
@Override
public void setUp() throws Exception {
super.setUp();
analyzer = new CollationKeyAnalyzer(collator);
}
-
+
@Override
public void tearDown() throws Exception {
analyzer.close();
super.tearDown();
}
- private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning =
+ new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
+ private BytesRef firstRangeEnd =
+ new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
+ private BytesRef secondRangeBeginning =
+ new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
+ private BytesRef secondRangeEnd =
+ new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
public void testFarsiRangeFilterCollating() throws Exception {
- testFarsiRangeFilterCollating
- (analyzer, firstRangeBeginning, firstRangeEnd,
- secondRangeBeginning, secondRangeEnd);
+ testFarsiRangeFilterCollating(
+ analyzer, firstRangeBeginning, firstRangeEnd, secondRangeBeginning, secondRangeEnd);
}
-
+
public void testFarsiRangeQueryCollating() throws Exception {
- testFarsiRangeQueryCollating
- (analyzer, firstRangeBeginning, firstRangeEnd,
- secondRangeBeginning, secondRangeEnd);
+ testFarsiRangeQueryCollating(
+ analyzer, firstRangeBeginning, firstRangeEnd, secondRangeBeginning, secondRangeEnd);
}
public void testFarsiTermRangeQuery() throws Exception {
- testFarsiTermRangeQuery
- (analyzer, firstRangeBeginning, firstRangeEnd,
- secondRangeBeginning, secondRangeEnd);
+ testFarsiTermRangeQuery(
+ analyzer, firstRangeBeginning, firstRangeEnd, secondRangeBeginning, secondRangeEnd);
}
-
+
public void testThreadSafe() throws Exception {
int iters = 20 * RANDOM_MULTIPLIER;
for (int i = 0; i < iters; i++) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index 3366611..ef3d5c2 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.standard;
-
import java.io.IOException;
import java.io.Reader;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
@@ -28,51 +26,57 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
/**
- * Filters {@link StandardTokenizer} with {@link LowerCaseFilter} and
- * {@link StopFilter}, using a configurable list of stop words.
+ * Filters {@link StandardTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}, using a
+ * configurable list of stop words.
*
* @since 3.1
*/
public final class StandardAnalyzer extends StopwordAnalyzerBase {
-
+
/** Default maximum allowed token length */
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
- /** Builds an analyzer with the given stop words.
- * @param stopWords stop words */
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopWords stop words
+ */
public StandardAnalyzer(CharArraySet stopWords) {
super(stopWords);
}
- /** Builds an analyzer with no stop words.
- */
+ /** Builds an analyzer with no stop words. */
public StandardAnalyzer() {
this(CharArraySet.EMPTY_SET);
}
- /** Builds an analyzer with the stop words from the given reader.
+ /**
+ * Builds an analyzer with the stop words from the given reader.
+ *
* @see WordlistLoader#getWordSet(Reader)
- * @param stopwords Reader to read stop words from */
+ * @param stopwords Reader to read stop words from
+ */
public StandardAnalyzer(Reader stopwords) throws IOException {
this(loadStopwordSet(stopwords));
}
/**
- * Set the max allowed token length. Tokens larger than this will be chopped
- * up at this token length and emitted as multiple tokens. If you need to
- * skip such large tokens, you could increase this max length, and then
- * use {@code LengthFilter} to remove long tokens. The default is
- * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+ * Set the max allowed token length. Tokens larger than this will be chopped up at this token
+ * length and emitted as multiple tokens. If you need to skip such large tokens, you could
+ * increase this max length, and then use {@code LengthFilter} to remove long tokens. The default
+ * is {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
}
-
- /** Returns the current maximum token length
- *
- * @see #setMaxTokenLength */
+
+ /**
+   * Returns the current maximum token length.
+ *
+ * @see #setMaxTokenLength
+ */
public int getMaxTokenLength() {
return maxTokenLength;
}
@@ -83,10 +87,12 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
src.setMaxTokenLength(maxTokenLength);
TokenStream tok = new LowerCaseFilter(src);
tok = new StopFilter(tok, stopwords);
- return new TokenStreamComponents(r -> {
- src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
- src.setReader(r);
- }, tok);
+ return new TokenStreamComponents(
+ r -> {
+ src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
+ src.setReader(r);
+ },
+ tok);
}
@Override
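
A minimal sketch of the setMaxTokenLength() contract documented above (the
sample input is illustrative): oversized tokens are split at the limit, not
dropped.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class MaxTokenLengthSketch {
      public static void main(String[] args) {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        analyzer.setMaxTokenLength(4);
        // "abcdefgh" is now emitted as two tokens, "abcd" and "efgh"; to drop
        // long tokens instead, keep the limit high and chain a LengthFilter.
        analyzer.close();
      }
    }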
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 50d1f9f..9cd7fbc 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -18,7 +18,6 @@
package org.apache.lucene.analysis.standard;
import java.io.IOException;
-
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -26,16 +25,16 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
-/** A grammar-based tokenizer constructed with JFlex.
- * <p>
- * This class implements the Word Break rules from the
- * Unicode Text Segmentation algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * <p>Many applications have specific tokenizer needs. If this tokenizer does
- * not suit your application, please consider copying this source code
- * directory to your project and maintaining your own grammar-based tokenizer.
+/**
+ * A grammar-based tokenizer constructed with JFlex.
+ *
+ * <p>This class implements the Word Break rules from the Unicode Text Segmentation algorithm, as
+ * specified in <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ *
+ * <p>Many applications have specific tokenizer needs. If this tokenizer does not suit your
+ * application, please consider copying this source code directory to your project and maintaining
+ * your own grammar-based tokenizer.
*/
-
public final class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private StandardTokenizerImpl scanner;
@@ -56,36 +55,36 @@ public final class StandardTokenizer extends Tokenizer {
public static final int HANGUL = 6;
/** Emoji token type. */
public static final int EMOJI = 7;
-
+
/** String token types that correspond to token type int constants */
- public static final String [] TOKEN_TYPES = new String [] {
- "<ALPHANUM>",
- "<NUM>",
- "<SOUTHEAST_ASIAN>",
- "<IDEOGRAPHIC>",
- "<HIRAGANA>",
- "<KATAKANA>",
- "<HANGUL>",
- "<EMOJI>"
- };
-
+ public static final String[] TOKEN_TYPES =
+ new String[] {
+ "<ALPHANUM>",
+ "<NUM>",
+ "<SOUTHEAST_ASIAN>",
+ "<IDEOGRAPHIC>",
+ "<HIRAGANA>",
+ "<KATAKANA>",
+ "<HANGUL>",
+ "<EMOJI>"
+ };
+
/** Absolute maximum sized token */
public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
-
+
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/**
- * Set the max allowed token length. Tokens larger than this will be chopped
- * up at this token length and emitted as multiple tokens. If you need to
- * skip such large tokens, you could increase this max length, and then
- * use {@code LengthFilter} to remove long tokens. The default is
- * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
- *
- * @throws IllegalArgumentException if the given length is outside of the
- * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
- */
+ * Set the max allowed token length. Tokens larger than this will be chopped up at this token
+ * length and emitted as multiple tokens. If you need to skip such large tokens, you could
+ * increase this max length, and then use {@code LengthFilter} to remove long tokens. The default
+ * is {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+ *
+ * @throws IllegalArgumentException if the given length is outside of the range [1, {@value
+ * #MAX_TOKEN_LENGTH_LIMIT}].
+ */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
@@ -98,25 +97,27 @@ public final class StandardTokenizer extends Tokenizer {
}
}
- /** Returns the current maximum token length
- *
- * @see #setMaxTokenLength */
+ /**
+   * Returns the current maximum token length.
+ *
+ * @see #setMaxTokenLength
+ */
public int getMaxTokenLength() {
return maxTokenLength;
}
/**
- * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
- * the <code>input</code> to the newly created JFlex scanner.
-
- * See http://issues.apache.org/jira/browse/LUCENE-1068
+ * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}.
+ * Attaches the <code>input</code> to the newly created JFlex scanner.
+ *
+ * <p>See http://issues.apache.org/jira/browse/LUCENE-1068
*/
public StandardTokenizer() {
init();
}
/**
- * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
+ * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
*/
public StandardTokenizer(AttributeFactory factory) {
super(factory);
@@ -131,7 +132,8 @@ public final class StandardTokenizer extends Tokenizer {
// term offset, positionIncrement and type
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt =
+ addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
/*
@@ -144,7 +146,7 @@ public final class StandardTokenizer extends Tokenizer {
clearAttributes();
skippedPositions = 0;
- while(true) {
+ while (true) {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF) {
@@ -152,10 +154,10 @@ public final class StandardTokenizer extends Tokenizer {
}
if (scanner.yylength() <= maxTokenLength) {
- posIncrAtt.setPositionIncrement(skippedPositions+1);
+ posIncrAtt.setPositionIncrement(skippedPositions + 1);
scanner.getText(termAtt);
final int start = scanner.yychar();
- offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
return true;
} else
@@ -164,7 +166,7 @@ public final class StandardTokenizer extends Tokenizer {
skippedPositions++;
}
}
-
+
@Override
public final void end() throws IOException {
super.end();
@@ -172,7 +174,7 @@ public final class StandardTokenizer extends Tokenizer {
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
// adjust any skipped tokens
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
@Override
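
For reference, a minimal consumption loop for this tokenizer, following the
usual TokenStream contract (reset, incrementToken, end, close); the sample
text is illustrative:

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class TokenizerLoopSketch {
      public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Unicode text segmentation, UAX #29."));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
          System.out.println(term + " [" + offset.startOffset() + "-" + offset.endOffset() + ")");
        }
        tokenizer.end();
        tokenizer.close();
      }
    }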
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java
index cfcf74c..ea8273a 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java
@@ -16,20 +16,19 @@
*/
package org.apache.lucene.analysis.standard;
-
+import java.util.Map;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
-import java.util.Map;
-
/**
- * Factory for {@link StandardTokenizer}.
+ * Factory for {@link StandardTokenizer}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/>
* </analyzer>
- * </fieldType></pre>
+ * </fieldType></pre>
*
* @since 3.1
* @lucene.spi {@value #NAME}
@@ -40,9 +39,9 @@ public class StandardTokenizerFactory extends TokenizerFactory {
public static final String NAME = "standard";
private final int maxTokenLength;
-
+
/** Creates a new StandardTokenizerFactory */
- public StandardTokenizerFactory(Map<String,String> args) {
+ public StandardTokenizerFactory(Map<String, String> args) {
super(args);
maxTokenLength = getInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
if (!args.isEmpty()) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
index ff313a3..aa13349 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
@@ -16,17 +16,16 @@
*/
/**
- * Fast, general-purpose grammar-based tokenizer {@link org.apache.lucene.analysis.standard.StandardTokenizer}
- * implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
- * Unlike <code>UAX29URLEmailTokenizer</code> from the analysis module, URLs and email addresses are
- * <b>not</b> tokenized as single tokens, but are instead split up into
- * tokens according to the UAX#29 word break rules.
- * <br>
- * {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} includes
- * {@link org.apache.lucene.analysis.standard.StandardTokenizer StandardTokenizer},
- * {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
- * and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ * Fast, general-purpose grammar-based tokenizer {@link
+ * org.apache.lucene.analysis.standard.StandardTokenizer} implements the Word Break rules from the
+ * Unicode Text Segmentation algorithm, as specified in <a
+ * href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>. Unlike <code>
+ * UAX29URLEmailTokenizer</code> from the analysis module, URLs and email addresses are <b>not</b>
+ * tokenized as single tokens, but are instead split up into tokens according to the UAX#29 word
+ * break rules. <br>
+ * {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} includes {@link
+ * org.apache.lucene.analysis.standard.StandardTokenizer StandardTokenizer}, {@link
+ * org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter} and {@link
+ * org.apache.lucene.analysis.StopFilter StopFilter}.
*/
-
package org.apache.lucene.analysis.standard;
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttribute.java
index 4c65086..227aa68 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttribute.java
@@ -16,16 +16,15 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.BytesRef;
/**
- * This attribute can be used if you have the raw term bytes to be indexed.
- * It can be used as replacement for {@link CharTermAttribute}, if binary
- * terms should be indexed.
+ * This attribute can be used if you have the raw term bytes to be indexed. It can be used as
+ * a replacement for {@link CharTermAttribute} when binary terms should be indexed.
+ *
* @lucene.internal
*/
public interface BytesTermAttribute extends TermToBytesRefAttribute {
/** Sets the {@link BytesRef} of the term */
public void setBytesRef(BytesRef bytes);
-}
\ No newline at end of file
+}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttributeImpl.java
index 6cebb9d..8783bbf 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/BytesTermAttributeImpl.java
@@ -16,17 +16,18 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import java.util.Objects;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
-/** Implementation class for {@link BytesTermAttribute}.
+/**
+ * Implementation class for {@link BytesTermAttribute}.
+ *
* @lucene.internal
*/
-public class BytesTermAttributeImpl extends AttributeImpl implements BytesTermAttribute, TermToBytesRefAttribute {
+public class BytesTermAttributeImpl extends AttributeImpl
+ implements BytesTermAttribute, TermToBytesRefAttribute {
private BytesRef bytes;
/** Initialize this attribute with no bytes. */
@@ -55,7 +56,7 @@ public class BytesTermAttributeImpl extends AttributeImpl implements BytesTermAt
@Override
public AttributeImpl clone() {
- BytesTermAttributeImpl c = (BytesTermAttributeImpl)super.clone();
+ BytesTermAttributeImpl c = (BytesTermAttributeImpl) super.clone();
copyTo(c);
return c;
}
@@ -77,4 +78,4 @@ public class BytesTermAttributeImpl extends AttributeImpl implements BytesTermAt
public int hashCode() {
return Objects.hash(bytes);
}
-}
\ No newline at end of file
+}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java
index 5e10bd7..f018ed5 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java
@@ -16,83 +16,88 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
-/**
- * The term text of a Token.
- */
+/** The term text of a Token. */
public interface CharTermAttribute extends Attribute, CharSequence, Appendable {
-
- /** Copies the contents of buffer, starting at offset for
- * length characters, into the termBuffer array.
- * @param buffer the buffer to copy
- * @param offset the index in the buffer of the first character to copy
- * @param length the number of characters to copy
+
+ /**
+ * Copies the contents of buffer, starting at offset for length characters, into the termBuffer
+ * array.
+ *
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
*/
public void copyBuffer(char[] buffer, int offset, int length);
-
- /** Returns the internal termBuffer character array which
- * you can then directly alter. If the array is too
- * small for your token, use {@link
- * #resizeBuffer(int)} to increase it. After
- * altering the buffer be sure to call {@link
- * #setLength} to record the number of valid
- * characters that were placed into the termBuffer.
- * <p>
- * <b>NOTE</b>: The returned buffer may be larger than
- * the valid {@link #length()}.
+
+ /**
+ * Returns the internal termBuffer character array which you can then directly alter. If the array
+ * is too small for your token, use {@link #resizeBuffer(int)} to increase it. After altering the
+ * buffer be sure to call {@link #setLength} to record the number of valid characters that were
+ * placed into the termBuffer.
+ *
+ * <p><b>NOTE</b>: The returned buffer may be larger than the valid {@link #length()}.
*/
public char[] buffer();
- /** Grows the termBuffer to at least size newSize, preserving the
- * existing content.
- * @param newSize minimum size of the new termBuffer
- * @return newly created termBuffer with {@code length >= newSize}
+ /**
+ * Grows the termBuffer to at least size newSize, preserving the existing content.
+ *
+ * @param newSize minimum size of the new termBuffer
+ * @return newly created termBuffer with {@code length >= newSize}
*/
public char[] resizeBuffer(int newSize);
- /** Set number of valid characters (length of the term) in
- * the termBuffer array. Use this to truncate the termBuffer
- * or to synchronize with external manipulation of the termBuffer.
- * Note: to grow the size of the array,
- * use {@link #resizeBuffer(int)} first.
- * @param length the truncated length
+ /**
+ * Set number of valid characters (length of the term) in the termBuffer array. Use this to
+ * truncate the termBuffer or to synchronize with external manipulation of the termBuffer. Note:
+ * to grow the size of the array, use {@link #resizeBuffer(int)} first.
+ *
+ * @param length the truncated length
*/
public CharTermAttribute setLength(int length);
-
- /** Sets the length of the termBuffer to zero.
- * Use this method before appending contents
- * using the {@link Appendable} interface.
+
+ /**
+ * Sets the length of the termBuffer to zero. Use this method before appending contents using the
+ * {@link Appendable} interface.
*/
public CharTermAttribute setEmpty();
-
+
// the following methods are redefined to get rid of IOException declaration:
@Override
public CharTermAttribute append(CharSequence csq);
+
@Override
public CharTermAttribute append(CharSequence csq, int start, int end);
+
@Override
public CharTermAttribute append(char c);
- /** Appends the specified {@code String} to this character sequence.
- * <p>The characters of the {@code String} argument are appended, in order, increasing the length of
- * this sequence by the length of the argument. If argument is {@code null}, then the four
- * characters {@code "null"} are appended.
+ /**
+ * Appends the specified {@code String} to this character sequence.
+ *
+ * <p>The characters of the {@code String} argument are appended, in order, increasing the length
+ * of this sequence by the length of the argument. If argument is {@code null}, then the four
+ * characters {@code "null"} are appended.
*/
public CharTermAttribute append(String s);
- /** Appends the specified {@code StringBuilder} to this character sequence.
- * <p>The characters of the {@code StringBuilder} argument are appended, in order, increasing the length of
- * this sequence by the length of the argument. If argument is {@code null}, then the four
- * characters {@code "null"} are appended.
+ /**
+ * Appends the specified {@code StringBuilder} to this character sequence.
+ *
+ * <p>The characters of the {@code StringBuilder} argument are appended, in order, increasing the
+ * length of this sequence by the length of the argument. If argument is {@code null}, then the
+ * four characters {@code "null"} are appended.
*/
public CharTermAttribute append(StringBuilder sb);
- /** Appends the contents of the other {@code CharTermAttribute} to this character sequence.
- * <p>The characters of the {@code CharTermAttribute} argument are appended, in order, increasing the length of
- * this sequence by the length of the argument. If argument is {@code null}, then the four
- * characters {@code "null"} are appended.
+ /**
+ * Appends the contents of the other {@code CharTermAttribute} to this character sequence.
+ *
+ * <p>The characters of the {@code CharTermAttribute} argument are appended, in order, increasing
+ * the length of this sequence by the length of the argument. If argument is {@code null}, then
+ * the four characters {@code "null"} are appended.
*/
public CharTermAttribute append(CharTermAttribute termAtt);
}
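
The buffer()/setLength() protocol spelled out above is easiest to see in a
small filter. A hedged sketch (the class is hypothetical, not from this
patch) that truncates terms in place:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class TruncateSketchFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final int maxLen;

      TruncateSketchFilter(TokenStream in, int maxLen) {
        super(in);
        this.maxLen = maxLen;
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        // buffer() may be larger than length(); only length() chars are valid.
        if (termAtt.length() > maxLen) {
          termAtt.setLength(maxLen); // record the new number of valid chars
        }
        return true;
      }
    }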
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
index 9d6166d..bae5c60 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.tokenattributes;
import java.nio.CharBuffer;
import java.util.Objects;
-
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
@@ -26,15 +25,19 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/** Default implementation of {@link CharTermAttribute}. */
-public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttribute, TermToBytesRefAttribute, Cloneable {
+public class CharTermAttributeImpl extends AttributeImpl
+ implements CharTermAttribute, TermToBytesRefAttribute, Cloneable {
private static int MIN_BUFFER_SIZE = 10;
-
+
private char[] termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, Character.BYTES)];
private int termLength = 0;
-
- /** May be used by subclasses to convert to different charsets / encodings for implementing {@link #getBytesRef()}. */
+
+ /**
+ * May be used by subclasses to convert to different charsets / encodings for implementing {@link
+ * #getBytesRef()}.
+ */
protected BytesRefBuilder builder = new BytesRefBuilder();
-
+
/** Initialize this attribute with empty term text */
public CharTermAttributeImpl() {}
@@ -49,21 +52,21 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
public final char[] buffer() {
return termBuffer;
}
-
+
@Override
public final char[] resizeBuffer(int newSize) {
- if(termBuffer.length < newSize){
+ if (termBuffer.length < newSize) {
// Not big enough; create a new array with slight
// over allocation and preserve content
final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, Character.BYTES)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
- return termBuffer;
+ return termBuffer;
}
-
+
private void growTermBuffer(int newSize) {
- if(termBuffer.length < newSize){
+ if (termBuffer.length < newSize) {
// Not big enough; create a new array with slight
// over allocation:
termBuffer = new char[ArrayUtil.oversize(newSize, Character.BYTES)];
@@ -76,13 +79,13 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
termLength = length;
return this;
}
-
+
@Override
public final CharTermAttribute setEmpty() {
termLength = 0;
return this;
}
-
+
// *** TermToBytesRefAttribute interface ***
@Override
@@ -90,43 +93,42 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
builder.copyChars(termBuffer, 0, termLength);
return builder.get();
}
-
+
// *** CharSequence interface ***
@Override
public final int length() {
return termLength;
}
-
+
@Override
public final char charAt(int index) {
Objects.checkIndex(index, termLength);
return termBuffer[index];
}
-
+
@Override
public final CharSequence subSequence(final int start, final int end) {
Objects.checkFromToIndex(start, end, termLength);
return new String(termBuffer, start, end - start);
}
-
+
// *** Appendable interface ***
@Override
public final CharTermAttribute append(CharSequence csq) {
if (csq == null) // needed for Appendable compliance
- return appendNull();
+ return appendNull();
return append(csq, 0, csq.length());
}
-
+
@Override
public final CharTermAttribute append(CharSequence csq, int start, int end) {
if (csq == null) // needed for Appendable compliance
- csq = "null";
+ csq = "null";
// TODO: the optimized cases (jdk methods) will already do such checks, maybe re-organize this?
Objects.checkFromToIndex(start, end, csq.length());
final int len = end - start;
- if (len == 0)
- return this;
+ if (len == 0) return this;
resizeBuffer(termLength + len);
if (len > 4) { // only use instanceof check series for longer CSQs, else simply iterate
if (csq instanceof String) {
@@ -137,56 +139,55 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
System.arraycopy(((CharTermAttribute) csq).buffer(), start, termBuffer, termLength, len);
} else if (csq instanceof CharBuffer && ((CharBuffer) csq).hasArray()) {
final CharBuffer cb = (CharBuffer) csq;
- System.arraycopy(cb.array(), cb.arrayOffset() + cb.position() + start, termBuffer, termLength, len);
+ System.arraycopy(
+ cb.array(), cb.arrayOffset() + cb.position() + start, termBuffer, termLength, len);
} else if (csq instanceof StringBuffer) {
((StringBuffer) csq).getChars(start, end, termBuffer, termLength);
} else {
- while (start < end)
- termBuffer[termLength++] = csq.charAt(start++);
+ while (start < end) termBuffer[termLength++] = csq.charAt(start++);
// no fall-through here, as termLength is updated!
return this;
}
termLength += len;
return this;
} else {
- while (start < end)
- termBuffer[termLength++] = csq.charAt(start++);
+ while (start < end) termBuffer[termLength++] = csq.charAt(start++);
return this;
}
}
-
+
@Override
public final CharTermAttribute append(char c) {
resizeBuffer(termLength + 1)[termLength++] = c;
return this;
}
-
+
// *** For performance some convenience methods in addition to CSQ's ***
-
+
@Override
public final CharTermAttribute append(String s) {
if (s == null) // needed for Appendable compliance
- return appendNull();
+ return appendNull();
final int len = s.length();
s.getChars(0, len, resizeBuffer(termLength + len), termLength);
termLength += len;
return this;
}
-
+
@Override
public final CharTermAttribute append(StringBuilder s) {
if (s == null) // needed for Appendable compliance
- return appendNull();
+ return appendNull();
final int len = s.length();
s.getChars(0, len, resizeBuffer(termLength + len), termLength);
termLength += len;
return this;
}
-
+
@Override
public final CharTermAttribute append(CharTermAttribute ta) {
if (ta == null) // needed for Appendable compliance
- return appendNull();
+ return appendNull();
final int len = ta.length();
System.arraycopy(ta.buffer(), 0, resizeBuffer(termLength + len), termLength, len);
termLength += len;
@@ -201,7 +202,7 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
termBuffer[termLength++] = 'l';
return this;
}
-
+
// *** AttributeImpl ***
@Override
@@ -213,12 +214,12 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
@Override
public void clear() {
- termLength = 0;
+ termLength = 0;
}
@Override
public CharTermAttributeImpl clone() {
- CharTermAttributeImpl t = (CharTermAttributeImpl)super.clone();
+ CharTermAttributeImpl t = (CharTermAttributeImpl) super.clone();
// Do a deep clone
t.termBuffer = new char[this.termLength];
System.arraycopy(this.termBuffer, 0, t.termBuffer, 0, this.termLength);
@@ -226,47 +227,42 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
t.builder.copyBytes(builder.get());
return t;
}
-
+
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
-
+
if (other instanceof CharTermAttributeImpl) {
final CharTermAttributeImpl o = ((CharTermAttributeImpl) other);
- if (termLength != o.termLength)
- return false;
- for(int i=0;i<termLength;i++) {
+ if (termLength != o.termLength) return false;
+ for (int i = 0; i < termLength; i++) {
if (termBuffer[i] != o.termBuffer[i]) {
return false;
}
}
return true;
}
-
+
return false;
}
- /**
- * Returns solely the term text as specified by the
- * {@link CharSequence} interface.
- */
+ /** Returns solely the term text as specified by the {@link CharSequence} interface. */
@Override
public String toString() {
return new String(termBuffer, 0, termLength);
}
-
+
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(CharTermAttribute.class, "term", toString());
reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
}
-
+
@Override
public void copyTo(AttributeImpl target) {
CharTermAttribute t = (CharTermAttribute) target;
t.copyBuffer(termBuffer, 0, termLength);
}
-
}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
index 842e47d..e30f807 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
@@ -16,22 +16,23 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
/**
- * This attribute can be used to pass different flags down the {@link Tokenizer} chain,
- * e.g. from one TokenFilter to another one.
- * <p>
- * This is completely distinct from {@link TypeAttribute}, although they do share similar purposes.
- * The flags can be used to encode information about the token for use by other
- * {@link org.apache.lucene.analysis.TokenFilter}s.
+ * This attribute can be used to pass different flags down the {@link Tokenizer} chain, e.g. from
+ * one TokenFilter to another.
+ *
+ * <p>This is completely distinct from {@link TypeAttribute}, although they do share similar
+ * purposes. The flags can be used to encode information about the token for use by other {@link
+ * org.apache.lucene.analysis.TokenFilter}s.
+ *
* @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public interface FlagsAttribute extends Attribute {
/**
- * Get the bitset for any bits that have been set.
+ * Get the bitset for any bits that have been set.
+ *
* @return The bits
* @see #getFlags()
*/
@@ -39,7 +40,8 @@ public interface FlagsAttribute extends Attribute {
/**
* Set the flags to a new bitset.
+ *
* @see #getFlags()
*/
- public void setFlags(int flags);
+ public void setFlags(int flags);
}
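
A hedged sketch of the flag-passing pattern this javadoc describes; the flag
bit chosen here is an arbitrary illustration, not a Lucene constant:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;

    final class MarkNumericSketchFilter extends TokenFilter {
      static final int NUMERIC_FLAG = 1; // illustrative bit, read downstream

      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

      MarkNumericSketchFilter(TokenStream in) {
        super(in);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        if (termAtt.length() > 0 && Character.isDigit(termAtt.charAt(0))) {
          flagsAtt.setFlags(flagsAtt.getFlags() | NUMERIC_FLAG); // OR the bit in
        }
        return true;
      }
    }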
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
index 208a32d..9a1bdc3 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
@@ -16,17 +16,16 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link FlagsAttribute}. */
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable {
private int flags = 0;
-
+
/** Initialize this attribute with no bits set */
public FlagsAttributeImpl() {}
-
+
@Override
public int getFlags() {
return flags;
@@ -36,7 +35,7 @@ public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute,
public void setFlags(int flags) {
this.flags = flags;
}
-
+
@Override
public void clear() {
flags = 0;
@@ -47,11 +46,11 @@ public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute,
if (this == other) {
return true;
}
-
+
if (other instanceof FlagsAttributeImpl) {
return ((FlagsAttributeImpl) other).flags == flags;
}
-
+
return false;
}
@@ -59,7 +58,7 @@ public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute,
public int hashCode() {
return flags;
}
-
+
@Override
public void copyTo(AttributeImpl target) {
FlagsAttribute t = (FlagsAttribute) target;
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
index 8b99bc2..0cee55b 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
@@ -16,35 +16,30 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Attribute;
/**
- * This attribute can be used to mark a token as a keyword. Keyword aware
- * {@link TokenStream}s can decide to modify a token based on the return value
- * of {@link #isKeyword()} if the token is modified. Stemming filters for
- * instance can use this attribute to conditionally skip a term if
+ * This attribute can be used to mark a token as a keyword. Keyword aware {@link TokenStream}s can
+ * decide whether to modify a token based on the return value of {@link #isKeyword()}. Stemming
+ * filters for instance can use this attribute to conditionally skip a term if
* {@link #isKeyword()} returns <code>true</code>.
*/
public interface KeywordAttribute extends Attribute {
/**
- * Returns <code>true</code> if the current token is a keyword, otherwise
- * <code>false</code>
- *
- * @return <code>true</code> if the current token is a keyword, otherwise
- * <code>false</code>
+ * Returns <code>true</code> if the current token is a keyword, otherwise <code>false</code>
+ *
+ * @return <code>true</code> if the current token is a keyword, otherwise <code>false</code>
* @see #setKeyword(boolean)
*/
public boolean isKeyword();
/**
* Marks the current token as keyword if set to <code>true</code>.
- *
- * @param isKeyword
- * <code>true</code> if the current token is a keyword, otherwise
- * <code>false</code>.
+ *
+ * @param isKeyword <code>true</code> if the current token is a keyword, otherwise <code>false
+ * </code>.
* @see #isKeyword()
*/
public void setKeyword(boolean isKeyword);
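
A minimal sketch of the keyword-aware pattern described above: a mutating filter that leaves
marked tokens untouched, the same check real stemming filters perform. The filter itself is
hypothetical and lowercases per char (ignoring surrogate pairs) for brevity.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/** Hypothetical keyword-aware filter: lowercases a term unless it is marked as a keyword. */
final class LowerCaseUnlessKeywordFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  LowerCaseUnlessKeywordFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAtt.isKeyword()) { // skip tokens a previous filter marked as keywords
      final char[] buffer = termAtt.buffer();
      for (int i = 0; i < termAtt.length(); i++) {
        buffer[i] = Character.toLowerCase(buffer[i]);
      }
    }
    return true;
  }
}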
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
index bda62bb..ae0dbaa 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
@@ -16,15 +16,13 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link KeywordAttribute}. */
-public final class KeywordAttributeImpl extends AttributeImpl implements
- KeywordAttribute {
+public final class KeywordAttributeImpl extends AttributeImpl implements KeywordAttribute {
private boolean keyword;
-
+
/** Initialize this attribute with the keyword value as false. */
public KeywordAttributeImpl() {}
@@ -46,10 +44,8 @@ public final class KeywordAttributeImpl extends AttributeImpl implements
@Override
public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (getClass() != obj.getClass())
- return false;
+ if (this == obj) return true;
+ if (getClass() != obj.getClass()) return false;
final KeywordAttributeImpl other = (KeywordAttributeImpl) obj;
return keyword == other.keyword;
}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
index 1153448..0c68459 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
@@ -16,40 +16,37 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
-/**
- * The start and end character offset of a Token.
- */
+/** The start and end character offset of a Token. */
public interface OffsetAttribute extends Attribute {
- /**
- * Returns this Token's starting offset, the position of the first character
- * corresponding to this token in the source text.
- * <p>
- * Note that the difference between {@link #endOffset()} and <code>startOffset()</code>
- * may not be equal to termText.length(), as the term text may have been altered by a
- * stemmer or some other filter.
- * @see #setOffset(int, int)
+ /**
+ * Returns this Token's starting offset, the position of the first character corresponding to this
+ * token in the source text.
+ *
+ * <p>Note that the difference between {@link #endOffset()} and <code>startOffset()</code> may not
+ * be equal to termText.length(), as the term text may have been altered by a stemmer or some
+ * other filter.
+ *
+ * @see #setOffset(int, int)
*/
public int startOffset();
-
- /**
+ /**
* Set the starting and ending offset.
- * @throws IllegalArgumentException If <code>startOffset</code> or <code>endOffset</code>
- * are negative, or if <code>startOffset</code> is greater than
- * <code>endOffset</code>
+ *
+ * @throws IllegalArgumentException If <code>startOffset</code> or <code>endOffset</code> are
+ * negative, or if <code>startOffset</code> is greater than <code>endOffset</code>
* @see #startOffset()
* @see #endOffset()
*/
public void setOffset(int startOffset, int endOffset);
-
- /**
- * Returns this Token's ending offset, one greater than the position of the
- * last character corresponding to this token in the source text. The length
- * of the token in the source text is (<code>endOffset()</code> - {@link #startOffset()}).
+ /**
+ * Returns this Token's ending offset, one greater than the position of the last character
+ * corresponding to this token in the source text. The length of the token in the source text is (
+ * <code>endOffset()</code> - {@link #startOffset()}).
+ *
* @see #setOffset(int, int)
*/
public int endOffset();
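
To make the offset contract concrete, a small self-contained sketch (reusing the
"David has 5000 bones" sample from the tests further down) that prints each term with its
offsets. Offsets index the original text, so endOffset() - startOffset() need not equal the
term length once filters have altered the term.

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class PrintOffsets {
  public static void main(String[] args) throws Exception {
    StandardTokenizer ts = new StandardTokenizer();
    ts.setReader(new StringReader("David has 5000 bones"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints "David [0,5)", "has [6,9)", "5000 [10,14)", "bones [15,20)"
      System.out.println(
          termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
    }
    ts.end();
    ts.close();
  }
}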
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
index 166d6b2..8ddae00 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
@@ -24,7 +23,7 @@ import org.apache.lucene.util.AttributeReflector;
public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable {
private int startOffset;
private int endOffset;
-
+
/** Initialize this attribute with startOffset and endOffset of 0. */
public OffsetAttributeImpl() {}
@@ -43,20 +42,23 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
// OffsetAtt
if (startOffset < 0 || endOffset < startOffset) {
- throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
- + "startOffset=" + startOffset + ",endOffset=" + endOffset);
+ throw new IllegalArgumentException(
+ "startOffset must be non-negative, and endOffset must be >= startOffset; got "
+ + "startOffset="
+ + startOffset
+ + ",endOffset="
+ + endOffset);
}
this.startOffset = startOffset;
this.endOffset = endOffset;
}
-
+
@Override
public int endOffset() {
return endOffset;
}
-
@Override
public void clear() {
// TODO: we could use -1 as default here? Then we can
@@ -64,18 +66,18 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
startOffset = 0;
endOffset = 0;
}
-
+
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
-
+
if (other instanceof OffsetAttributeImpl) {
OffsetAttributeImpl o = (OffsetAttributeImpl) other;
return o.startOffset == startOffset && o.endOffset == endOffset;
}
-
+
return false;
}
@@ -84,13 +86,13 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
int code = startOffset;
code = code * 31 + endOffset;
return code;
- }
-
+ }
+
@Override
public void copyTo(AttributeImpl target) {
OffsetAttribute t = (OffsetAttribute) target;
t.setOffset(startOffset, endOffset);
- }
+ }
@Override
public void reflectWith(AttributeReflector reflector) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
index d057397..296af82 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
@@ -16,35 +16,40 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
-/** Default implementation of the common attributes used by Lucene:<ul>
- * <li>{@link CharTermAttribute}
- * <li>{@link TypeAttribute}
- * <li>{@link PositionIncrementAttribute}
- * <li>{@link PositionLengthAttribute}
- * <li>{@link OffsetAttribute}
- * <li>{@link TermFrequencyAttribute}
- * </ul>*/
-public class PackedTokenAttributeImpl extends CharTermAttributeImpl
- implements TypeAttribute, PositionIncrementAttribute,
- PositionLengthAttribute, OffsetAttribute,
- TermFrequencyAttribute {
-
- private int startOffset,endOffset;
+/**
+ * Default implementation of the common attributes used by Lucene:
+ *
+ * <ul>
+ * <li>{@link CharTermAttribute}
+ * <li>{@link TypeAttribute}
+ * <li>{@link PositionIncrementAttribute}
+ * <li>{@link PositionLengthAttribute}
+ * <li>{@link OffsetAttribute}
+ * <li>{@link TermFrequencyAttribute}
+ * </ul>
+ */
+public class PackedTokenAttributeImpl extends CharTermAttributeImpl
+ implements TypeAttribute,
+ PositionIncrementAttribute,
+ PositionLengthAttribute,
+ OffsetAttribute,
+ TermFrequencyAttribute {
+
+ private int startOffset, endOffset;
private String type = DEFAULT_TYPE;
private int positionIncrement = 1;
private int positionLength = 1;
private int termFrequency = 1;
/** Constructs the attribute implementation. */
- public PackedTokenAttributeImpl() {
- }
+ public PackedTokenAttributeImpl() {}
/**
* {@inheritDoc}
+ *
* @see PositionIncrementAttribute
*/
@Override
@@ -57,6 +62,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see PositionIncrementAttribute
*/
@Override
@@ -66,18 +72,21 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see PositionLengthAttribute
*/
@Override
public void setPositionLength(int positionLength) {
if (positionLength < 1) {
- throw new IllegalArgumentException("Position length must be 1 or greater: got " + positionLength);
+ throw new IllegalArgumentException(
+ "Position length must be 1 or greater: got " + positionLength);
}
this.positionLength = positionLength;
}
/**
* {@inheritDoc}
+ *
* @see PositionLengthAttribute
*/
@Override
@@ -87,6 +96,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see OffsetAttribute
*/
@Override
@@ -96,6 +106,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see OffsetAttribute
*/
@Override
@@ -105,13 +116,18 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see OffsetAttribute
*/
@Override
public void setOffset(int startOffset, int endOffset) {
if (startOffset < 0 || endOffset < startOffset) {
- throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
- + "startOffset=" + startOffset + ",endOffset=" + endOffset);
+ throw new IllegalArgumentException(
+ "startOffset must be non-negative, and endOffset must be >= startOffset; got "
+ + "startOffset="
+ + startOffset
+ + ",endOffset="
+ + endOffset);
}
this.startOffset = startOffset;
this.endOffset = endOffset;
@@ -119,6 +135,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see TypeAttribute
*/
@Override
@@ -128,6 +145,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
/**
* {@inheritDoc}
+ *
* @see TypeAttribute
*/
@Override
@@ -138,7 +156,8 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
@Override
public final void setTermFrequency(int termFrequency) {
if (termFrequency < 1) {
- throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
+ throw new IllegalArgumentException(
+ "Term frequency must be 1 or greater; got " + termFrequency);
}
this.termFrequency = termFrequency;
}
@@ -148,8 +167,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
return termFrequency;
}
- /** Resets the attributes
- */
+ /** Resets the attributes */
@Override
public void clear() {
super.clear();
@@ -158,9 +176,8 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
-
- /** Resets the attributes at end
- */
+
+ /** Resets the attributes at end */
@Override
public void end() {
super.end();
@@ -175,21 +192,18 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
@Override
public boolean equals(Object obj) {
- if (obj == this)
- return true;
+ if (obj == this) return true;
if (obj instanceof PackedTokenAttributeImpl) {
final PackedTokenAttributeImpl other = (PackedTokenAttributeImpl) obj;
- return (startOffset == other.startOffset &&
- endOffset == other.endOffset &&
- positionIncrement == other.positionIncrement &&
- positionLength == other.positionLength &&
- (type == null ? other.type == null : type.equals(other.type)) &&
- termFrequency == other.termFrequency &&
- super.equals(obj)
- );
- } else
- return false;
+ return (startOffset == other.startOffset
+ && endOffset == other.endOffset
+ && positionIncrement == other.positionIncrement
+ && positionLength == other.positionLength
+ && (type == null ? other.type == null : type.equals(other.type))
+ && termFrequency == other.termFrequency
+ && super.equals(obj));
+ } else return false;
}
@Override
@@ -199,8 +213,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
code = code * 31 + endOffset;
code = code * 31 + positionIncrement;
code = code * 31 + positionLength;
- if (type != null)
- code = code * 31 + type.hashCode();
+ if (type != null) code = code * 31 + type.hashCode();
code = code * 31 + termFrequency;
return code;
}
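
Because this single implementation backs all six interfaces, one instance can be driven
through any of those views. A small standalone sketch, independent of this commit:

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class PackedDemo {
  public static void main(String[] args) {
    PackedTokenAttributeImpl att = new PackedTokenAttributeImpl();
    att.append("bones"); // CharTermAttribute view, inherited from CharTermAttributeImpl
    att.setOffset(15, 20); // OffsetAttribute view
    att.setType(TypeAttribute.DEFAULT_TYPE); // TypeAttribute view
    att.setPositionIncrement(1); // PositionIncrementAttribute view

    OffsetAttribute asOffset = att; // assignable wherever any of the interfaces is expected
    System.out.println(att + " [" + asOffset.startOffset() + "," + asOffset.endOffset() + ")");
  }
}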
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
index 73ecdeb..2c45dd6 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
@@ -16,31 +16,32 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.BytesRef;
/**
* The payload of a Token.
- * <p>
- * The payload is stored in the index at each position, and can
- * be used to influence scoring when using Payload-based queries.
- * <p>
- * NOTE: because the payload will be stored at each position, it's usually
- * best to use the minimum number of bytes necessary. Some codec implementations
- * may optimize payload storage when all payloads have the same length.
- *
+ *
+ * <p>The payload is stored in the index at each position, and can be used to influence scoring when
+ * using Payload-based queries.
+ *
+ * <p>NOTE: because the payload will be stored at each position, it's usually best to use the
+ * minimum number of bytes necessary. Some codec implementations may optimize payload storage when
+ * all payloads have the same length.
+ *
* @see org.apache.lucene.index.PostingsEnum
*/
public interface PayloadAttribute extends Attribute {
/**
* Returns this Token's payload.
+ *
* @see #setPayload(BytesRef)
- */
+ */
public BytesRef getPayload();
- /**
+ /**
* Sets this Token's payload.
+ *
* @see #getPayload()
*/
public void setPayload(BytesRef payload);
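
A minimal sketch of a payload-producing filter. Following the NOTE above it stores a single
byte per position; the filter and its byte encoding are hypothetical, not part of this commit.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.BytesRef;

/** Hypothetical filter: stores one byte per position encoding whether the type is the default. */
final class TypeBytePayloadFilter extends TokenFilter {
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

  TypeBytePayloadFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // a single byte keeps per-position storage minimal, per the NOTE in the javadoc
    byte code = (byte) (TypeAttribute.DEFAULT_TYPE.equals(typeAtt.type()) ? 0 : 1);
    payloadAtt.setPayload(new BytesRef(new byte[] {code}));
    return true;
  }
}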
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java
index 5eaed0c..da36ebd 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java
@@ -16,27 +16,22 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
/** Default implementation of {@link PayloadAttribute}. */
public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable {
- private BytesRef payload;
-
- /**
- * Initialize this attribute with no payload.
- */
+ private BytesRef payload;
+
+ /** Initialize this attribute with no payload. */
public PayloadAttributeImpl() {}
-
- /**
- * Initialize this attribute with the given payload.
- */
+
+ /** Initialize this attribute with the given payload. */
public PayloadAttributeImpl(BytesRef payload) {
this.payload = payload;
}
-
+
@Override
public BytesRef getPayload() {
return this.payload;
@@ -46,14 +41,14 @@ public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttrib
public void setPayload(BytesRef payload) {
this.payload = payload;
}
-
+
@Override
public void clear() {
payload = null;
}
@Override
- public PayloadAttributeImpl clone() {
+ public PayloadAttributeImpl clone() {
PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone();
if (payload != null) {
clone.payload = BytesRef.deepCopyOf(payload);
@@ -66,16 +61,16 @@ public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttrib
if (other == this) {
return true;
}
-
+
if (other instanceof PayloadAttribute) {
PayloadAttributeImpl o = (PayloadAttributeImpl) other;
if (o.payload == null || payload == null) {
return o.payload == null && payload == null;
}
-
+
return o.payload.equals(payload);
}
-
+
return false;
}
@@ -88,7 +83,7 @@ public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttrib
public void copyTo(AttributeImpl target) {
PayloadAttribute t = (PayloadAttribute) target;
t.setPayload((payload == null) ? null : BytesRef.deepCopyOf(payload));
- }
+ }
@Override
public void reflectWith(AttributeReflector reflector) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
index 2e73713..90c571e 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
@@ -16,46 +16,44 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
-/** Determines the position of this token
- * relative to the previous Token in a TokenStream, used in phrase
- * searching.
+/**
+ * Determines the position of this token relative to the previous Token in a TokenStream, used in
+ * phrase searching.
*
* <p>The default value is one.
*
- * <p>Some common uses for this are:<ul>
- *
- * <li>Set it to zero to put multiple terms in the same position. This is
- * useful if, e.g., a word has multiple stems. Searches for phrases
- * including either stem will match. In this case, all but the first stem's
- * increment should be set to zero: the increment of the first instance
- * should be one. Repeating a token with an increment of zero can also be
- * used to boost the scores of matches on that token.
- *
- * <li>Set it to values greater than one to inhibit exact phrase matches.
- * If, for example, one does not want phrases to match across removed stop
- * words, then one could build a stop word filter that removes stop words and
- * also sets the increment to the number of stop words removed before each
- * non-stop word. Then exact phrase queries will only match when the terms
- * occur with no intervening stop words.
+ * <p>Some common uses for this are:
*
+ * <ul>
+ * <li>Set it to zero to put multiple terms in the same position. This is useful if, e.g., a word
+ * has multiple stems. Searches for phrases including either stem will match. In this case,
+ * all but the first stem's increment should be set to zero: the increment of the first
+ * instance should be one. Repeating a token with an increment of zero can also be used to
+ * boost the scores of matches on that token.
+ * <li>Set it to values greater than one to inhibit exact phrase matches. If, for example, one
+ * does not want phrases to match across removed stop words, then one could build a stop word
+ * filter that removes stop words and also sets the increment to the number of stop words
+ * removed before each non-stop word. Then exact phrase queries will only match when the terms
+ * occur with no intervening stop words.
* </ul>
- *
+ *
* @see org.apache.lucene.index.PostingsEnum
*/
public interface PositionIncrementAttribute extends Attribute {
- /** Set the position increment. The default value is one.
+ /**
+ * Set the position increment. The default value is one.
*
* @param positionIncrement the distance from the prior term
- * @throws IllegalArgumentException if <code>positionIncrement</code>
- * is negative.
+ * @throws IllegalArgumentException if <code>positionIncrement</code> is negative.
* @see #getPositionIncrement()
*/
public void setPositionIncrement(int positionIncrement);
- /** Returns the position increment of this Token.
+ /**
+ * Returns the position increment of this Token.
+ *
* @see #setPositionIncrement(int)
*/
public int getPositionIncrement();
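
A minimal sketch of the second use case in the list above: a stop filter that drops tokens
but leaves position gaps, so exact phrase queries cannot match across removed stop words.
The class is hypothetical; Lucene's real StopFilter preserves increments the same way.

import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/** Hypothetical stop filter that accumulates the increments of removed tokens. */
final class GapStopFilter extends TokenFilter {
  private final Set<String> stopWords;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt =
      addAttribute(PositionIncrementAttribute.class);

  GapStopFilter(TokenStream in, Set<String> stopWords) {
    super(in);
    this.stopWords = stopWords;
  }

  @Override
  public boolean incrementToken() throws IOException {
    int skipped = 0;
    while (input.incrementToken()) {
      if (!stopWords.contains(termAtt.toString())) {
        // carry the gap of the removed tokens forward onto the kept token
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skipped);
        return true;
      }
      skipped += posIncrAtt.getPositionIncrement();
    }
    return false;
  }
}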
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
index e89fec1..80ef0d4 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
@@ -16,21 +16,22 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link PositionIncrementAttribute}. */
-public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable {
+public class PositionIncrementAttributeImpl extends AttributeImpl
+ implements PositionIncrementAttribute, Cloneable {
private int positionIncrement = 1;
-
+
/** Initialize this attribute with position increment of 1 */
public PositionIncrementAttributeImpl() {}
@Override
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0) {
- throw new IllegalArgumentException("Position increment must be zero or greater; got " + positionIncrement);
+ throw new IllegalArgumentException(
+ "Position increment must be zero or greater; got " + positionIncrement);
}
this.positionIncrement = positionIncrement;
}
@@ -44,7 +45,7 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
public void clear() {
this.positionIncrement = 1;
}
-
+
@Override
public void end() {
this.positionIncrement = 0;
@@ -55,12 +56,12 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
if (other == this) {
return true;
}
-
+
if (other instanceof PositionIncrementAttributeImpl) {
PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
- return positionIncrement == _other.positionIncrement;
+ return positionIncrement == _other.positionIncrement;
}
-
+
return false;
}
@@ -68,12 +69,12 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
public int hashCode() {
return positionIncrement;
}
-
+
@Override
public void copyTo(AttributeImpl target) {
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
t.setPositionIncrement(positionIncrement);
- }
+ }
@Override
public void reflectWith(AttributeReflector reflector) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java
index 210eb0f..ab4996a5 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java
@@ -16,35 +16,31 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
-/** Determines how many positions this
- * token spans. Very few analyzer components actually
- * produce this attribute, and indexing ignores it, but
- * it's useful to express the graph structure naturally
- * produced by decompounding, word splitting/joining,
- * synonym filtering, etc.
+/**
+ * Determines how many positions this token spans. Very few analyzer components actually produce
+ * this attribute, and indexing ignores it, but it's useful to express the graph structure naturally
+ * produced by decompounding, word splitting/joining, synonym filtering, etc.
*
- * <p>NOTE: this is optional, and most analyzers
- * don't change the default value (1). */
-
+ * <p>NOTE: this is optional, and most analyzers don't change the default value (1).
+ */
public interface PositionLengthAttribute extends Attribute {
/**
* Set the position length of this Token.
- * <p>
- * The default value is one.
- * @param positionLength how many positions this token
- * spans.
- * @throws IllegalArgumentException if <code>positionLength</code>
- * is zero or negative.
+ *
+ * <p>The default value is one.
+ *
+ * @param positionLength how many positions this token spans.
+ * @throws IllegalArgumentException if <code>positionLength</code> is zero or negative.
* @see #getPositionLength()
*/
public void setPositionLength(int positionLength);
- /** Returns the position length of this Token.
+ /**
+ * Returns the position length of this Token.
+ *
* @see #setPositionLength
*/
public int getPositionLength();
}
-
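
Since indexing ignores this attribute, its typical consumer is code that walks the token
graph itself. A small hypothetical helper showing how position increment and position length
together define graph edges:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

final class GraphDumper {
  /** Prints each token as an edge "pos -> pos + positionLength" in the token graph. */
  static void dump(TokenStream ts) throws IOException {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
      pos += posIncAtt.getPositionIncrement();
      // most tokens keep the default length of 1; a multi-position token such as a
      // single-token synonym for a two-word phrase would report 2 here
      System.out.println(termAtt + ": " + pos + " -> " + (pos + posLenAtt.getPositionLength()));
    }
    ts.end();
    ts.close();
  }
}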
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
index d019a2b..3a59faa 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
@@ -16,21 +16,22 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link PositionLengthAttribute}. */
-public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
+public class PositionLengthAttributeImpl extends AttributeImpl
+ implements PositionLengthAttribute, Cloneable {
private int positionLength = 1;
-
+
/** Initializes this attribute with position length of 1. */
public PositionLengthAttributeImpl() {}
-
+
@Override
public void setPositionLength(int positionLength) {
if (positionLength < 1) {
- throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength);
+ throw new IllegalArgumentException(
+ "Position length must be 1 or greater; got " + positionLength);
}
this.positionLength = positionLength;
}
@@ -44,18 +45,18 @@ public class PositionLengthAttributeImpl extends AttributeImpl implements Positi
public void clear() {
this.positionLength = 1;
}
-
+
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
-
+
if (other instanceof PositionLengthAttributeImpl) {
PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other;
- return positionLength == _other.positionLength;
+ return positionLength == _other.positionLength;
}
-
+
return false;
}
@@ -63,12 +64,12 @@ public class PositionLengthAttributeImpl extends AttributeImpl implements Positi
public int hashCode() {
return positionLength;
}
-
+
@Override
public void copyTo(AttributeImpl target) {
PositionLengthAttribute t = (PositionLengthAttribute) target;
t.setPositionLength(positionLength);
- }
+ }
@Override
public void reflectWith(AttributeReflector reflector) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttribute.java
index 0f33aef..bad431c 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttribute.java
@@ -20,9 +20,11 @@ package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.Attribute;
-/** Sets the custom term frequency of a term within one document. If this attribute
- * is present in your analysis chain for a given field, that field must be indexed with
- * {@link IndexOptions#DOCS_AND_FREQS}. */
+/**
+ * Sets the custom term frequency of a term within one document. If this attribute is present in
+ * your analysis chain for a given field, that field must be indexed with {@link
+ * IndexOptions#DOCS_AND_FREQS}.
+ */
public interface TermFrequencyAttribute extends Attribute {
/** Set the custom term frequency of the current term within one document. */
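
In practice this attribute is populated by a filter that parses the frequency out of the
token text, in the spirit of the delimited term-frequency filter shipped in the analysis
module. A simplified, hypothetical sketch with no error handling:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;

/** Hypothetical filter: rewrites "term|3" into "term" with a custom frequency of 3. */
final class DelimitedFreqFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);

  DelimitedFreqFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    String term = termAtt.toString();
    int bar = term.lastIndexOf('|');
    if (bar >= 0) {
      tfAtt.setTermFrequency(Integer.parseInt(term.substring(bar + 1)));
      termAtt.setLength(bar); // strip the "|freq" suffix from the term
    }
    return true;
  }
}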
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttributeImpl.java
index c214453..4e8520f 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermFrequencyAttributeImpl.java
@@ -16,21 +16,22 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link TermFrequencyAttribute}. */
-public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFrequencyAttribute, Cloneable {
+public class TermFrequencyAttributeImpl extends AttributeImpl
+ implements TermFrequencyAttribute, Cloneable {
private int termFrequency = 1;
-
+
/** Initialize this attribute with term frequency of 1 */
public TermFrequencyAttributeImpl() {}
@Override
public void setTermFrequency(int termFrequency) {
if (termFrequency < 1) {
- throw new IllegalArgumentException("Term frequency must be 1 or greater; got " + termFrequency);
+ throw new IllegalArgumentException(
+ "Term frequency must be 1 or greater; got " + termFrequency);
}
this.termFrequency = termFrequency;
}
@@ -44,7 +45,7 @@ public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFre
public void clear() {
this.termFrequency = 1;
}
-
+
@Override
public void end() {
this.termFrequency = 1;
@@ -55,12 +56,12 @@ public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFre
if (other == this) {
return true;
}
-
+
if (other instanceof TermFrequencyAttributeImpl) {
TermFrequencyAttributeImpl _other = (TermFrequencyAttributeImpl) other;
- return termFrequency == _other.termFrequency;
+ return termFrequency == _other.termFrequency;
}
-
+
return false;
}
@@ -68,12 +69,12 @@ public class TermFrequencyAttributeImpl extends AttributeImpl implements TermFre
public int hashCode() {
return Integer.hashCode(termFrequency);
}
-
+
@Override
public void copyTo(AttributeImpl target) {
TermFrequencyAttribute t = (TermFrequencyAttribute) target;
t.setTermFrequency(termFrequency);
- }
+ }
@Override
public void reflectWith(AttributeReflector reflector) {
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java
index 7fe5e25..19b0b9f 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java
@@ -16,15 +16,15 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.BytesRef;
/**
- * This attribute is requested by TermsHashPerField to index the contents.
- * This attribute can be used to customize the final byte[] encoding of terms.
- * <p>
- * Consumers of this attribute call {@link #getBytesRef()} for each term. Example:
+ * This attribute is requested by TermsHashPerField to index the contents. This attribute can be
+ * used to customize the final byte[] encoding of terms.
+ *
+ * <p>Consumers of this attribute call {@link #getBytesRef()} for each term. Example:
+ *
* <pre class="prettyprint">
* final TermToBytesRefAttribute termAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
*
@@ -32,7 +32,7 @@ import org.apache.lucene.util.BytesRef;
* final BytesRef bytes = termAtt.getBytesRef();
*
* if (isInteresting(bytes)) {
- *
+ *
* // because the bytes are reused by the attribute (like CharTermAttribute's char[] buffer),
* // you should make a copy if you need persistent access to the bytes, otherwise they will
* // be rewritten across calls to incrementToken()
@@ -42,15 +42,17 @@ import org.apache.lucene.util.BytesRef;
* }
* ...
* </pre>
- * @lucene.internal This is a very expert and internal API, please use
- * {@link CharTermAttribute} and its implementation for UTF-8 terms; to
- * index binary terms, use {@link BytesTermAttribute} and its implementation.
+ *
+ * @lucene.internal This is a very expert and internal API, please use {@link CharTermAttribute} and
+ * its implementation for UTF-8 terms; to index binary terms, use {@link BytesTermAttribute} and
+ * its implementation.
*/
public interface TermToBytesRefAttribute extends Attribute {
-
+
/**
- * Retrieve this attribute's BytesRef. The bytes are updated from the current term.
- * The implementation may return a new instance or keep the previous one.
+ * Retrieve this attribute's BytesRef. The bytes are updated from the current term. The
+ * implementation may return a new instance or keep the previous one.
+ *
* @return a BytesRef to be indexed (only stays valid until token stream gets incremented)
*/
public BytesRef getBytesRef();
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
index 74ceb72..89af7b3 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
@@ -16,26 +16,25 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.Attribute;
-/**
- * A Token's lexical type. The Default value is "word".
- */
+/** A Token's lexical type. The default value is "word". */
public interface TypeAttribute extends Attribute {
/** the default type */
public static final String DEFAULT_TYPE = "word";
- /**
- * Returns this Token's lexical type. Defaults to "word".
+ /**
+ * Returns this Token's lexical type. Defaults to "word".
+ *
* @see #setType(String)
*/
public String type();
- /**
+ /**
* Set the lexical type.
- * @see #type()
+ *
+ * @see #type()
*/
public void setType(String type);
}
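
A short sketch of where types come from in practice: StandardTokenizer assigns a lexical type
to every token. The printed values ("<ALPHANUM>", "<NUM>") are assumptions based on its token
type table, not output of this commit.

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class PrintTypes {
  public static void main(String[] args) throws Exception {
    StandardTokenizer ts = new StandardTokenizer();
    ts.setReader(new StringReader("R2D2 21.35"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt + " -> " + typeAtt.type()); // e.g. R2D2 -> <ALPHANUM>
    }
    ts.end();
    ts.close();
  }
}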
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java
index 7486b69..dbe5e94 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java
@@ -16,24 +16,23 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link TypeAttribute}. */
public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable {
private String type;
-
+
/** Initialize this attribute with {@link TypeAttribute#DEFAULT_TYPE} */
public TypeAttributeImpl() {
- this(DEFAULT_TYPE);
+ this(DEFAULT_TYPE);
}
-
+
/** Initialize this attribute with <code>type</code> */
public TypeAttributeImpl(String type) {
this.type = type;
}
-
+
@Override
public String type() {
return type;
@@ -46,7 +45,7 @@ public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, C
@Override
public void clear() {
- type = DEFAULT_TYPE;
+ type = DEFAULT_TYPE;
}
@Override
@@ -54,12 +53,12 @@ public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, C
if (other == this) {
return true;
}
-
+
if (other instanceof TypeAttributeImpl) {
final TypeAttributeImpl o = (TypeAttributeImpl) other;
return (this.type == null ? o.type == null : this.type.equals(o.type));
}
-
+
return false;
}
@@ -67,7 +66,7 @@ public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, C
public int hashCode() {
return (type == null) ? 0 : type.hashCode();
}
-
+
@Override
public void copyTo(AttributeImpl target) {
TypeAttribute t = (TypeAttribute) target;
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/package-info.java
index 7ad0029..a70ffcf 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * General-purpose attributes for text analysis.
- */
+/** General-purpose attributes for text analysis. */
package org.apache.lucene.analysis.tokenattributes;
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 6242095..6d79dc5 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.analysis.standard;
-
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@@ -24,7 +23,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockGraphTokenFilter;
@@ -36,45 +34,51 @@ import org.apache.lucene.util.TestUtil;
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
- // LUCENE-5897: slow tokenization of strings of the form (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
+ // LUCENE-5897: slow tokenization of strings of the form
+ // (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
@Slow
public void testLargePartiallyMatchingToken() throws Exception {
// TODO: get these lists of chars matching a property from ICU4J
// http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
- char[] WordBreak_ExtendNumLet_chars = "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
+ char[] WordBreak_ExtendNumLet_chars =
+ "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
// http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
- int[] WordBreak_Format_chars // only the first char in ranges
- = { 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF,
- 0xFFF9, 0x110BD, 0x1D173, 0xE0001, 0xE0020 };
+ int[] WordBreak_Format_chars // only the first char in ranges
+ = {
+ 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF, 0xFFF9,
+ 0x110BD, 0x1D173, 0xE0001, 0xE0020
+ };
// http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
int[] WordBreak_Extend_chars // only the first char in ranges
- = { 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df,
- 0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4,
- 0x900, 0x93a, 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2,
- 0xa01, 0xa3c, 0xa3e, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7,
- 0xacb, 0xae2, 0xb01, 0xb3c, 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6,
- 0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6,
- 0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46, 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf,
- 0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1, 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35,
- 0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102b, 0x1056, 0x105e, 0x1062,
- 0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4,
- 0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8, 0x1a17, 0x1a55, 0x1a60, 0x1a7f,
- 0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24, 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2,
- 0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674,
- 0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880, 0xa8b4, 0xa8e0, 0xa926, 0xa947,
- 0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1,
- 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20, 0xff9e, 0x101fd, 0x10a01,
- 0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038, 0x11080, 0x11082,
- 0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180, 0x11182,
- 0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
- 0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100 };
-
+ = {
+ 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df, 0x6e7,
+ 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4, 0x900, 0x93a,
+ 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2, 0xa01, 0xa3c, 0xa3e,
+ 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7, 0xacb, 0xae2, 0xb01, 0xb3c,
+ 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6, 0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46,
+ 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6, 0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46,
+ 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf, 0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1,
+ 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35, 0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6,
+ 0x102b, 0x1056, 0x105e, 0x1062, 0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712,
+ 0x1732, 0x1752, 0x1772, 0x17b4, 0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8,
+ 0x1a17, 0x1a55, 0x1a60, 0x1a7f, 0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24,
+ 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2, 0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0,
+ 0x302a, 0x3099, 0xa66f, 0xa674, 0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880,
+ 0xa8b4, 0xa8e0, 0xa926, 0xa947, 0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0,
+ 0xaab2, 0xaab7, 0xaabe, 0xaac1, 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20,
+ 0xff9e, 0x101fd, 0x10a01, 0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038,
+ 0x11080, 0x11082, 0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180,
+ 0x11182, 0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
+ 0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100
+ };
+
StringBuilder builder = new StringBuilder();
int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
- for (int i = 0 ; i < numChars ; ) {
- builder.append(WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
+ for (int i = 0; i < numChars; ) {
+ builder.append(
+ WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
++i;
if (random().nextBoolean()) {
int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
@@ -94,7 +98,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
StandardTokenizer ts = new StandardTokenizer();
ts.setReader(new StringReader(builder.toString()));
ts.reset();
- while (ts.incrementToken()) { }
+ while (ts.incrementToken()) {}
ts.end();
ts.close();
@@ -102,11 +106,11 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
ts.setMaxTokenLength(newBufferSize); // try a different buffer size
ts.setReader(new StringReader(builder.toString()));
ts.reset();
- while (ts.incrementToken()) { }
+ while (ts.incrementToken()) {}
ts.end();
ts.close();
}
-
+
public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder();
char whitespace[] = new char[4094];
@@ -116,23 +120,24 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
String input = sb.toString();
StandardTokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader(input));
- BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
+ BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] {"testing", "1234"});
}
private Analyzer a;
-
+
@Override
public void setUp() throws Exception {
super.setUp();
- a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
- return new TokenStreamComponents(tokenizer);
- }
- };
+ a =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
+ return new TokenStreamComponents(tokenizer);
+ }
+ };
}
-
+
@Override
public void tearDown() throws Exception {
a.close();
@@ -140,328 +145,528 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testArmenian() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
- new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
- "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
+ new String[] {
+ "Վիքիպեդիայի",
+ "13",
+ "միլիոն",
+ "հոդվածները",
+ "4,600",
+ "հայերեն",
+ "վիքիպեդիայում",
+ "գրվել",
+ "են",
+ "կամավորների",
+ "կողմից",
+ "ու",
+ "համարյա",
+ "բոլոր",
+ "հոդվածները",
+ "կարող",
+ "է",
+ "խմբագրել",
+ "ցանկաց",
+ "մարդ",
+ "ով",
+ "կարող",
+ "է",
+ "բացել",
+ "Վիքիպեդիայի",
+ "կայքը"
+ });
}
-
+
public void testAmharic() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
- new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
+ new String[] {
+ "ዊኪፔድያ",
+ "የባለ",
+ "ብዙ",
+ "ቋንቋ",
+ "የተሟላ",
+ "ትክክለኛና",
+ "ነጻ",
+ "መዝገበ",
+ "ዕውቀት",
+ "ኢንሳይክሎፒዲያ",
+ "ነው",
+ "ማንኛውም"
+ });
}
-
+
public void testArabic() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
- new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
- "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
+ new String[] {
+ "الفيلم",
+ "الوثائقي",
+ "الأول",
+ "عن",
+ "ويكيبيديا",
+ "يسمى",
+ "الحقيقة",
+ "بالأرقام",
+ "قصة",
+ "ويكيبيديا",
+ "بالإنجليزية",
+ "Truth",
+ "in",
+ "Numbers",
+ "The",
+ "Wikipedia",
+ "Story",
+ "سيتم",
+ "إطلاقه",
+ "في",
+ "2008"
+ });
}
-
+
public void testAramaic() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
- new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
- "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
+ new String[] {
+ "ܘܝܩܝܦܕܝܐ",
+ "ܐܢܓܠܝܐ",
+ "Wikipedia",
+ "ܗܘ",
+ "ܐܝܢܣܩܠܘܦܕܝܐ",
+ "ܚܐܪܬܐ",
+ "ܕܐܢܛܪܢܛ",
+ "ܒܠܫܢ̈ܐ",
+ "ܣܓܝܐ̈ܐ",
+ "ܫܡܗ",
+ "ܐܬܐ",
+ "ܡܢ",
+ "ܡ̈ܠܬܐ",
+ "ܕ",
+ "ܘܝܩܝ",
+ "ܘ",
+ "ܐܝܢܣܩܠܘܦܕܝܐ"
+ });
}
-
+
public void testBengali() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
- new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
- "শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
+ new String[] {
+ "এই",
+ "বিশ্বকোষ",
+ "পরিচালনা",
+ "করে",
+ "উইকিমিডিয়া",
+ "ফাউন্ডেশন",
+ "একটি",
+ "অলাভজনক",
+ "সংস্থা",
+ "উইকিপিডিয়ার",
+ "শুরু",
+ "১৫",
+ "জানুয়ারি",
+ "২০০১",
+ "সালে",
+ "এখন",
+ "পর্যন্ত",
+ "২০০টিরও",
+ "বেশী",
+ "ভাষায়",
+ "উইকিপিডিয়া",
+ "রয়েছে"
+ });
}
-
+
public void testFarsi() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
- new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
- "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
+ new String[] {
+ "ویکی",
+ "پدیای",
+ "انگلیسی",
+ "در",
+ "تاریخ",
+ "۲۵",
+ "دی",
+ "۱۳۷۹",
+ "به",
+ "صورت",
+ "مکملی",
+ "برای",
+ "دانشنامهٔ",
+ "تخصصی",
+ "نوپدیا",
+ "نوشته",
+ "شد"
+ });
}
-
+
public void testGreek() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
- new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
- "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
+ new String[] {
+ "Γράφεται",
+ "σε",
+ "συνεργασία",
+ "από",
+ "εθελοντές",
+ "με",
+ "το",
+ "λογισμικό",
+ "wiki",
+ "κάτι",
+ "που",
+ "σημαίνει",
+ "ότι",
+ "άρθρα",
+ "μπορεί",
+ "να",
+ "προστεθούν",
+ "ή",
+ "να",
+ "αλλάξουν",
+ "από",
+ "τον",
+ "καθένα"
+ });
}
public void testThai() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
- new String[] { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
+ new String[] {"การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔"});
}
-
+
public void testLao() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
- new String[] { "ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
+ new String[] {"ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ"});
}
-
+
public void testTibetan() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
- new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
- "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
- "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+ new String[] {
+ "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
+ "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
+ "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ"
+ });
}
-
+
/*
* For Chinese, tokenize as char (these can later form bigrams or whatever)
*/
public void testChinese() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
- new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "我是中国人。 1234 Tests ", new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
}
-
+
public void testEmpty() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
}
-
+
/* test various jira issues this analyzer is related to */
-
+
public void testLUCENE1545() throws Exception {
/*
* Standard analyzer does not correctly tokenize the combining character U+0364 COMBINING LATIN SMALL LETTER E.
* The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
* Expected result is only one token "moͤchte".
*/
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] {"moͤchte"});
}
-
+
/* Tests from StandardAnalyzer, just to show behavior is similar */
public void testAlphanumericSA() throws Exception {
// alphanumeric tokens
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[] {"B2B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[] {"2B"});
}
public void testDelimitersSA() throws Exception {
// other delimiters: "-", "/", ","
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "some-dashed-phrase", new String[] {"some", "dashed", "phrase"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "dogs,chase,cats", new String[] {"dogs", "chase", "cats"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[] {"ac", "dc"});
}
public void testApostrophesSA() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[] {"O'Reilly"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[] {"you're"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[] {"she's"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[] {"Jim's"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[] {"don't"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[] {"O'Reilly's"});
}
public void testNumericSA() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[] {"21.35"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[] {"R2D2", "C3PO"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] {"216.239.63.104"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] {"216.239.63.104"});
}
public void testTextWithNumbersSA() throws Exception {
// numbers
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "David has 5000 bones", new String[] {"David", "has", "5000", "bones"});
}
public void testVariousTextSA() throws Exception {
// various
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "C embedded developers wanted", new String[] {"C", "embedded", "developers", "wanted"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "foo bar FOO BAR", new String[] {"foo", "bar", "FOO", "BAR"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "foo bar . FOO <> BAR", new String[] {"foo", "bar", "FOO", "BAR"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[] {"QUOTED", "word"});
}
public void testKoreanSA() throws Exception {
// Korean words
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[] {"안녕하세요", "한글입니다"});
}
-
+
public void testOffsets() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new int[] {0, 6, 10, 15},
new int[] {5, 9, 14, 20});
}
-
+
public void testTypes() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones",
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
- new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
+ new String[] {"<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>"});
}
-
+
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
-
+
public void testSupplementary() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "𩬅艱鍟䇹愯瀛",
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
- new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ new String[] {
+ "<IDEOGRAPHIC>",
+ "<IDEOGRAPHIC>",
+ "<IDEOGRAPHIC>",
+ "<IDEOGRAPHIC>",
+ "<IDEOGRAPHIC>",
+ "<IDEOGRAPHIC>"
+ });
}
-
+
public void testKorean() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
- new String[] { "훈민정음" },
- new String[] { "<HANGUL>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "훈민정음", new String[] {"훈민정음"}, new String[] {"<HANGUL>"});
}
-
+
public void testJapanese() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
- new String[] { "仮", "名", "遣", "い", "カタカナ" },
- new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "仮名遣い カタカナ",
+ new String[] {"仮", "名", "遣", "い", "カタカナ"},
+ new String[] {
+ "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>"
+ });
}
-
+
public void testCombiningMarks() throws Exception {
checkOneTerm(a, "ざ", "ざ"); // hiragana
checkOneTerm(a, "ザ", "ザ"); // katakana
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
- checkOneTerm(a, "아゙", "아゙"); // hangul
+ checkOneTerm(a, "아゙", "아゙"); // hangul
}
/**
- * Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet},
- * and/or \p{MidNum} should trigger a token split.
+ * Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet}, and/or \p{WB:MidNum} should
+ * trigger a token split.
*/
public void testMid() throws Exception {
- // ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
-
- // '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
-
- // ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
+ // ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on
+ // both sides
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] {"A:B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] {"A", "B"});
+
+ // '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric
+ // char on both sides
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] {"1.2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] {"A.B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] {"1", "2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] {"A", "B"});
+
+ // ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both
+ // sides
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] {"1,2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] {"1", "2"});
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] {"A", "B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] {"A", "B"});
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] {"1", "2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] {"1", "2"});
// '_' is in \p{WB:ExtendNumLet}
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A:B", new String[] { "A:B_A:B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A::B", new String[] { "A:B_A", "B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A:B", new String[] {"A:B_A:B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A::B", new String[] {"A:B_A", "B"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1.2", new String[] { "1.2_1.2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A.B", new String[] { "A.B_A.B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1..2", new String[] { "1.2_1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A..B", new String[] { "A.B_A", "B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1.2", new String[] {"1.2_1.2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A.B", new String[] {"A.B_A.B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1..2", new String[] {"1.2_1", "2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A..B", new String[] {"A.B_A", "B"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,2", new String[] { "1,2_1,2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,,2", new String[] { "1,2_1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,2", new String[] {"1,2_1,2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,,2", new String[] {"1,2_1", "2"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A.:B", new String[] { "C_A", "B" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A:.B", new String[] { "C_A", "B" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A.:B", new String[] {"C_A", "B"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A:.B", new String[] {"C_A", "B"});
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1,.2", new String[] { "3_1", "2" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1,.2", new String[] {"3_1", "2"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] {"3_1", "2"});
}
/** simple emoji */
public void testEmoji() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
- new String[] { "💩", "💩", "💩" },
- new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "💩 💩💩",
+ new String[] {"💩", "💩", "💩"},
+ new String[] {"<EMOJI>", "<EMOJI>", "<EMOJI>"});
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
- new String[] { "👩❤️👩" },
- new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "👩❤️👩", new String[] {"👩❤️👩"}, new String[] {"<EMOJI>"});
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
- new String[] { "👨🏼⚕️" },
- new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "👨🏼⚕️", new String[] {"👨🏼⚕️"}, new String[] {"<EMOJI>"});
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
- new String[] { "🇺🇸", "🇺🇸" },
- new String[] { "<EMOJI>", "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "🇺🇸🇺🇸", new String[] {"🇺🇸", "🇺🇸"}, new String[] {"<EMOJI>", "<EMOJI>"});
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
- new String[] { "#️⃣" },
- new String[] { "<EMOJI>" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
- new String[] { "3️⃣",},
- new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "#️⃣", new String[] {"#️⃣"}, new String[] {"<EMOJI>"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "3️⃣",
+ new String[] {
+ "3️⃣",
+ },
+ new String[] {"<EMOJI>"});
// text presentation sequences
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
- new String[] { },
- new String[] { });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
- new String[] { "3\uFE0E",},
- new String[] { "<NUM>" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
- new String[] { "\u2B55",},
- new String[] { "<EMOJI>" });
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
- new String[] { "\u2B55", "\u200D\u2B55"},
- new String[] { "<EMOJI>", "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E", new String[] {}, new String[] {});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
+ new String[] {
+ "3\uFE0E",
+ },
+ new String[] {"<NUM>"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
+ new String[] {
+ "\u2B55",
+ },
+ new String[] {"<EMOJI>"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "\u2B55\uFE0E\u200D\u2B55\uFE0E",
+ new String[] {"\u2B55", "\u200D\u2B55"},
+ new String[] {"<EMOJI>", "<EMOJI>"});
}
public void testEmojiTagSequence() throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
- new String[] { "🏴" },
- new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a, "🏴", new String[] {"🏴"}, new String[] {"<EMOJI>"});
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
- new String[] { "poo", "💩", "poo" },
- new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "poo💩poo",
+ new String[] {"poo", "💩", "poo"},
+ new String[] {"<ALPHANUM>", "<EMOJI>", "<ALPHANUM>"});
// simple emoji around non-latin
- BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
- new String[] { "💩", "中", "國", "💩" },
- new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+ a,
+ "💩中國💩",
+ new String[] {"💩", "中", "國", "💩"},
+ new String[] {"<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>"});
}
-
+
public void testUnicodeEmojiTests() throws Exception {
EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
emojiTest.test(a);
}
-
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close();
}
-
+
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
- checkRandomData(random(), analyzer, 20*RANDOM_MULTIPLIER, 8192);
+ checkRandomData(random(), analyzer, 20 * RANDOM_MULTIPLIER, 8192);
analyzer.close();
}
// Adds random graph after:
public void testRandomHugeStringsGraphAfter() throws Exception {
Random random = random();
- Analyzer analyzer = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
- TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
- return new TokenStreamComponents(tokenizer, tokenStream);
- }
- };
- checkRandomData(random, analyzer, 20*RANDOM_MULTIPLIER, 8192);
+ Analyzer analyzer =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
+ TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
+ return new TokenStreamComponents(tokenizer, tokenStream);
+ }
+ };
+ checkRandomData(random, analyzer, 20 * RANDOM_MULTIPLIER, 8192);
analyzer.close();
}
@@ -475,7 +680,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
StringBuilder bToken = new StringBuilder();
// exact max length:
- for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+ for (int i = 0; i < StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; i++) {
bToken.append('b');
}
@@ -489,13 +694,13 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
public void testMaxTokenLengthNonDefault() throws Exception {
StandardAnalyzer a = new StandardAnalyzer();
a.setMaxTokenLength(5);
- assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+ assertAnalyzesTo(a, "ab cd toolong xy z", new String[] {"ab", "cd", "toolo", "ng", "xy", "z"});
a.close();
}
public void testSplitSurrogatePairWithSpoonFeedReader() throws Exception {
String text = "12345678\ud800\udf00"; // U+D800 U+DF00 = U+10300 = 𐌀 (OLD ITALIC LETTER A)
-
+
// Collect tokens with normal reader
StandardAnalyzer a = new StandardAnalyzer();
TokenStream ts = a.tokenStream("dummy", text);
@@ -509,12 +714,13 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
ts.close();
// Tokens from a spoon-feed reader should be the same as from a normal reader
- // The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the surrogate pair at a read boundary
+ // The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the
+ // surrogate pair at a read boundary
Reader reader = new SpoonFeedMaxCharsReaderWrapper(9, new StringReader(text));
ts = a.tokenStream("dummy", reader);
termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
- for (int tokenNum = 0 ; ts.incrementToken() ; ++tokenNum) {
+ for (int tokenNum = 0; ts.incrementToken(); ++tokenNum) {
assertEquals("token #" + tokenNum + " mismatch: ", termAtt.toString(), tokens.get(tokenNum));
}
ts.end();
@@ -524,7 +730,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
class SpoonFeedMaxCharsReaderWrapper extends Reader {
private final Reader in;
- private final int maxChars;
+ private final int maxChars;
public SpoonFeedMaxCharsReaderWrapper(int maxChars, Reader in) {
this.in = in;
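For readers tracing the surrogate-pair test above: this wrapper's whole job is to cap each read at maxChars. A minimal sketch of its read() override follows (an assumption for illustration only, since the method body is not part of this excerpt; it presumes java.io.IOException is imported):

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    // Hand the tokenizer at most maxChars per call; with maxChars = 9 the high
    // surrogate U+D800 ends one read and the low surrogate U+DF00 starts the next.
    return in.read(cbuf, off, Math.min(len, maxChars));
  }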
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
index 758d503..2efd046 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
@@ -16,48 +16,44 @@
*/
package org.apache.lucene.analysis.standard;
-
-import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
-import org.apache.lucene.analysis.Tokenizer;
-
import java.io.Reader;
import java.io.StringReader;
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.Tokenizer;
-/**
- * Simple tests to ensure the standard lucene factories are working.
- */
+/** Simple tests to ensure the standard Lucene factories are working. */
public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
- /**
- * Test StandardTokenizerFactory
- */
+ /** Test StandardTokenizerFactory */
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("Wha\u0301t's this thing do?");
Tokenizer stream = tokenizerFactory("Standard").create(newAttributeFactory());
stream.setReader(reader);
- assertTokenStreamContents(stream,
- new String[]{"Wha\u0301t's", "this", "thing", "do"});
+ assertTokenStreamContents(stream, new String[] {"Wha\u0301t's", "this", "thing", "do"});
}
-
+
public void testStandardTokenizerMaxTokenLength() throws Exception {
StringBuilder builder = new StringBuilder();
- for (int i = 0 ; i < 100 ; ++i) {
+ for (int i = 0; i < 100; ++i) {
builder.append("abcdefg"); // 7 * 100 = 700 char "word"
}
String longWord = builder.toString();
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
- Tokenizer stream = tokenizerFactory("Standard",
- "maxTokenLength", "1000").create(newAttributeFactory());
+ Tokenizer stream =
+ tokenizerFactory("Standard", "maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
- assertTokenStreamContents(stream,
- new String[]{"one", "two", "three", longWord, "four", "five", "six"});
+ assertTokenStreamContents(
+ stream, new String[] {"one", "two", "three", longWord, "four", "five", "six"});
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
- IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
- tokenizerFactory("Standard", "bogusArg", "bogusValue");
- });
+ IllegalArgumentException expected =
+ expectThrows(
+ IllegalArgumentException.class,
+ () -> {
+ tokenizerFactory("Standard", "bogusArg", "bogusValue");
+ });
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestBytesRefAttImpl.java b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestBytesRefAttImpl.java
index 6c01d5c..54f23e7 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestBytesRefAttImpl.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestBytesRefAttImpl.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java
index bbcf10d..53af58b 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java
@@ -16,17 +16,15 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.TestUtil;
-
import java.nio.CharBuffer;
-import java.util.HashMap;
import java.util.Formatter;
+import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Pattern;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
public class TestCharTermAttributeImpl extends LuceneTestCase {
@@ -34,8 +32,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
CharTermAttributeImpl t = new CharTermAttributeImpl();
char[] content = "hello".toCharArray();
t.copyBuffer(content, 0, content.length);
- for (int i = 0; i < 2000; i++)
- {
+ for (int i = 0; i < 2000; i++) {
t.resizeBuffer(i);
assertTrue(i <= t.buffer().length);
assertEquals("hello", t.toString());
@@ -46,17 +43,18 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
CharTermAttributeImpl t = new CharTermAttributeImpl();
char[] content = "hello".toCharArray();
t.copyBuffer(content, 0, content.length);
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.setLength(-1);
- });
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.setLength(-1);
+ });
}
@Slow
public void testGrow() {
CharTermAttributeImpl t = new CharTermAttributeImpl();
StringBuilder buf = new StringBuilder("ab");
- for (int i = 0; i < 20; i++)
- {
+ for (int i = 0; i < 20; i++) {
char[] content = buf.toString().toCharArray();
t.copyBuffer(content, 0, content.length);
assertEquals(buf.length(), t.length());
@@ -68,8 +66,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
// now as a StringBuilder, first variant
t = new CharTermAttributeImpl();
buf = new StringBuilder("ab");
- for (int i = 0; i < 20; i++)
- {
+ for (int i = 0; i < 20; i++) {
t.setEmpty().append(buf);
assertEquals(buf.length(), t.length());
assertEquals(buf.toString(), t.toString());
@@ -80,8 +77,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
// Test for slow growth to a long term
t = new CharTermAttributeImpl();
buf = new StringBuilder("a");
- for (int i = 0; i < 20000; i++)
- {
+ for (int i = 0; i < 20000; i++) {
t.setEmpty().append(buf);
assertEquals(buf.length(), t.length());
assertEquals(buf.toString(), t.toString());
@@ -109,7 +105,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
assertEquals(t.toString(), copy.toString());
assertNotSame(buf, copy.buffer());
}
-
+
public void testEquals() throws Exception {
CharTermAttributeImpl t1a = new CharTermAttributeImpl();
char[] content1a = "hello".toCharArray();
@@ -124,7 +120,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
assertFalse(t1a.equals(t2));
assertFalse(t2.equals(t1b));
}
-
+
public void testCopyTo() throws Exception {
CharTermAttributeImpl t = new CharTermAttributeImpl();
CharTermAttributeImpl copy = assertCopyIsEqual(t);
@@ -139,30 +135,34 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
assertEquals(t.toString(), copy.toString());
assertNotSame(buf, copy.buffer());
}
-
+
public void testAttributeReflection() throws Exception {
CharTermAttributeImpl t = new CharTermAttributeImpl();
t.append("foobar");
- TestUtil.assertAttributeReflection(t, new HashMap<String, Object>() {{
- put(CharTermAttribute.class.getName() + "#term", "foobar");
- put(TermToBytesRefAttribute.class.getName() + "#bytes", new BytesRef("foobar"));
- }});
+ TestUtil.assertAttributeReflection(
+ t,
+ new HashMap<String, Object>() {
+ {
+ put(CharTermAttribute.class.getName() + "#term", "foobar");
+ put(TermToBytesRefAttribute.class.getName() + "#bytes", new BytesRef("foobar"));
+ }
+ });
}
-
+
public void testCharSequenceInterface() {
- final String s = "0123456789";
+ final String s = "0123456789";
final CharTermAttributeImpl t = new CharTermAttributeImpl();
t.append(s);
-
+
assertEquals(s.length(), t.length());
- assertEquals("12", t.subSequence(1,3).toString());
- assertEquals(s, t.subSequence(0,s.length()).toString());
-
+ assertEquals("12", t.subSequence(1, 3).toString());
+ assertEquals(s, t.subSequence(0, s.length()).toString());
+
assertTrue(Pattern.matches("01\\d+", t));
- assertTrue(Pattern.matches("34", t.subSequence(3,5)));
-
- assertEquals(s.subSequence(3,7).toString(), t.subSequence(3,7).toString());
-
+ assertTrue(Pattern.matches("34", t.subSequence(3, 5)));
+
+ assertEquals(s.subSequence(3, 7).toString(), t.subSequence(3, 7).toString());
+
for (int i = 0; i < s.length(); i++) {
assertTrue(t.charAt(i) == s.charAt(i));
}
@@ -200,24 +200,34 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
assertEquals("4test", t.toString());
t.append((CharSequence) t2, 1, 2);
assertEquals("4teste", t.toString());
-
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.append((CharSequence) t2, 1, 5);
- });
-
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.append((CharSequence) t2, 1, 0);
- });
-
+
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.append((CharSequence) t2, 1, 5);
+ });
+
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.append((CharSequence) t2, 1, 0);
+ });
+
t.append((CharSequence) null);
assertEquals("4testenull", t.toString());
}
-
+
public void testAppendableInterfaceWithLongSequences() {
CharTermAttributeImpl t = new CharTermAttributeImpl();
t.append((CharSequence) "01234567890123456789012345678901234567890123456789");
- t.append((CharSequence) CharBuffer.wrap("01234567890123456789012345678901234567890123456789".toCharArray()), 3, 50);
- assertEquals("0123456789012345678901234567890123456789012345678934567890123456789012345678901234567890123456789", t.toString());
+ t.append(
+ (CharSequence)
+ CharBuffer.wrap("01234567890123456789012345678901234567890123456789".toCharArray()),
+ 3,
+ 50);
+ assertEquals(
+ "0123456789012345678901234567890123456789012345678934567890123456789012345678901234567890123456789",
+ t.toString());
t.setEmpty().append((CharSequence) new StringBuilder("01234567890123456789"), 5, 17);
assertEquals((CharSequence) "567890123456", t.toString());
t.append(new StringBuffer(t));
@@ -227,22 +237,34 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
assertEquals("345678901234567", buf.toString());
t.setEmpty().append(buf, 1, 14);
assertEquals("4567890123456", t.toString());
-
+
// finally use a completely custom CharSequence that is not caught by instanceof checks
final String longTestString = "012345678901234567890123456789";
- t.append(new CharSequence() {
- @Override
- public char charAt(int i) { return longTestString.charAt(i); }
- @Override
- public int length() { return longTestString.length(); }
- @Override
- public CharSequence subSequence(int start, int end) { return longTestString.subSequence(start, end); }
- @Override
- public String toString() { return longTestString; }
- });
- assertEquals("4567890123456"+longTestString, t.toString());
+ t.append(
+ new CharSequence() {
+ @Override
+ public char charAt(int i) {
+ return longTestString.charAt(i);
+ }
+
+ @Override
+ public int length() {
+ return longTestString.length();
+ }
+
+ @Override
+ public CharSequence subSequence(int start, int end) {
+ return longTestString.subSequence(start, end);
+ }
+
+ @Override
+ public String toString() {
+ return longTestString;
+ }
+ });
+ assertEquals("4567890123456" + longTestString, t.toString());
}
-
+
public void testNonCharSequenceAppend() {
CharTermAttributeImpl t = new CharTermAttributeImpl();
t.append("0123456789");
@@ -259,27 +281,35 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
t.append((CharTermAttribute) null);
assertEquals("012345678901234567890123456789testnullnullnull", t.toString());
}
-
+
public void testExceptions() {
CharTermAttributeImpl t = new CharTermAttributeImpl();
t.append("test");
assertEquals("test", t.toString());
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.charAt(-1);
- });
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.charAt(-1);
+ });
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.charAt(4);
- });
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.charAt(4);
+ });
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.subSequence(0, 5);
- });
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.subSequence(0, 5);
+ });
- expectThrows(IndexOutOfBoundsException.class, () -> {
- t.subSequence(5, 0);
- });
+ expectThrows(
+ IndexOutOfBoundsException.class,
+ () -> {
+ t.subSequence(5, 0);
+ });
}
public static <T extends AttributeImpl> T assertCloneIsEqual(T att) {
@@ -298,9 +328,9 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
assertEquals("Copied instance's hashcode must be equal", att.hashCode(), copy.hashCode());
return copy;
}
-
+
/*
-
+
// test speed of the dynamic instanceof checks in append(CharSequence),
// to find the best max length for the generic while (start<end) loop:
public void testAppendPerf() {
@@ -334,7 +364,7 @@ public class TestCharTermAttributeImpl extends LuceneTestCase {
long endTime = System.currentTimeMillis();
System.out.println("Time: " + (endTime-startTime)/1000.0 + " s");
}
-
+
*/
}
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestPackedTokenAttributeImpl.java b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestPackedTokenAttributeImpl.java
index c673fc6..8e44096 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestPackedTokenAttributeImpl.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestPackedTokenAttributeImpl.java
@@ -16,21 +16,19 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
+import java.io.StringReader;
+import java.util.HashMap;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
-import java.io.StringReader;
-import java.util.HashMap;
-
public class TestPackedTokenAttributeImpl extends LuceneTestCase {
/* the CharTermAttribute parts are tested by TestCharTermAttributeImpl */
-
+
public void testClone() throws Exception {
PackedTokenAttributeImpl t = new PackedTokenAttributeImpl();
t.setOffset(0, 5);
@@ -41,7 +39,7 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
assertEquals(t.toString(), copy.toString());
assertNotSame(buf, copy.buffer());
}
-
+
public void testCopyTo() throws Exception {
PackedTokenAttributeImpl t = new PackedTokenAttributeImpl();
PackedTokenAttributeImpl copy = TestCharTermAttributeImpl.assertCopyIsEqual(t);
@@ -57,42 +55,55 @@ public class TestPackedTokenAttributeImpl extends LuceneTestCase {
assertEquals(t.toString(), copy.toString());
assertNotSame(buf, copy.buffer());
}
-
+
public void testPackedTokenAttributeFactory() throws Exception {
- TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
- ((Tokenizer)ts).setReader(new StringReader("foo bar"));
-
- assertTrue("CharTermAttribute is not implemented by Token",
- ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
- assertTrue("OffsetAttribute is not implemented by Token",
- ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
- assertTrue("PositionIncrementAttribute is not implemented by Token",
- ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
- assertTrue("TypeAttribute is not implemented by Token",
- ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);
+ TokenStream ts =
+ new MockTokenizer(
+ TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
+ MockTokenizer.WHITESPACE,
+ false,
+ MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+ ((Tokenizer) ts).setReader(new StringReader("foo bar"));
+
+ assertTrue(
+ "CharTermAttribute is not implemented by Token",
+ ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
+ assertTrue(
+ "OffsetAttribute is not implemented by Token",
+ ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
+ assertTrue(
+ "PositionIncrementAttribute is not implemented by Token",
+ ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
+ assertTrue(
+ "TypeAttribute is not implemented by Token",
+ ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);
- assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
- ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);
+ assertTrue(
+ "FlagsAttribute is not implemented by FlagsAttributeImpl",
+ ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);
}
public void testAttributeReflection() throws Exception {
PackedTokenAttributeImpl t = new PackedTokenAttributeImpl();
t.append("foobar");
- t.setOffset(6, 22);
+ t.setOffset(6, 22);
t.setPositionIncrement(3);
t.setPositionLength(11);
t.setType("foobar");
t.setTermFrequency(42);
- TestUtil.assertAttributeReflection(t,
- new HashMap<String, Object>() {{
- put(CharTermAttribute.class.getName() + "#term", "foobar");
- put(TermToBytesRefAttribute.class.getName() + "#bytes", new BytesRef("foobar"));
- put(OffsetAttribute.class.getName() + "#startOffset", 6);
- put(OffsetAttribute.class.getName() + "#endOffset", 22);
- put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
- put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
- put(TypeAttribute.class.getName() + "#type", "foobar");
- put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
- }});
+ TestUtil.assertAttributeReflection(
+ t,
+ new HashMap<String, Object>() {
+ {
+ put(CharTermAttribute.class.getName() + "#term", "foobar");
+ put(TermToBytesRefAttribute.class.getName() + "#bytes", new BytesRef("foobar"));
+ put(OffsetAttribute.class.getName() + "#startOffset", 6);
+ put(OffsetAttribute.class.getName() + "#endOffset", 22);
+ put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 3);
+ put(PositionLengthAttribute.class.getName() + "#positionLength", 11);
+ put(TypeAttribute.class.getName() + "#type", "foobar");
+ put(TermFrequencyAttribute.class.getName() + "#termFrequency", 42);
+ }
+ });
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
index 0f752da..f8832d9 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
@@ -16,33 +16,42 @@
*/
package org.apache.lucene.analysis.tokenattributes;
-
-import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.LuceneTestCase;
-
import java.util.Collections;
import java.util.HashMap;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
public class TestSimpleAttributeImpl extends LuceneTestCase {
// this checks using reflection API if the defaults are correct
public void testAttributes() {
- TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(),
- Collections.singletonMap(PositionIncrementAttribute.class.getName() + "#positionIncrement", 1));
- TestUtil.assertAttributeReflection(new PositionLengthAttributeImpl(),
+ TestUtil.assertAttributeReflection(
+ new PositionIncrementAttributeImpl(),
+ Collections.singletonMap(
+ PositionIncrementAttribute.class.getName() + "#positionIncrement", 1));
+ TestUtil.assertAttributeReflection(
+ new PositionLengthAttributeImpl(),
Collections.singletonMap(PositionLengthAttribute.class.getName() + "#positionLength", 1));
- TestUtil.assertAttributeReflection(new FlagsAttributeImpl(),
+ TestUtil.assertAttributeReflection(
+ new FlagsAttributeImpl(),
Collections.singletonMap(FlagsAttribute.class.getName() + "#flags", 0));
- TestUtil.assertAttributeReflection(new TypeAttributeImpl(),
- Collections.singletonMap(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE));
- TestUtil.assertAttributeReflection(new PayloadAttributeImpl(),
+ TestUtil.assertAttributeReflection(
+ new TypeAttributeImpl(),
+ Collections.singletonMap(
+ TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE));
+ TestUtil.assertAttributeReflection(
+ new PayloadAttributeImpl(),
Collections.singletonMap(PayloadAttribute.class.getName() + "#payload", null));
- TestUtil.assertAttributeReflection(new KeywordAttributeImpl(),
+ TestUtil.assertAttributeReflection(
+ new KeywordAttributeImpl(),
Collections.singletonMap(KeywordAttribute.class.getName() + "#keyword", false));
- TestUtil.assertAttributeReflection(new OffsetAttributeImpl(), new HashMap<String, Object>() {{
- put(OffsetAttribute.class.getName() + "#startOffset", 0);
- put(OffsetAttribute.class.getName() + "#endOffset", 0);
- }});
+ TestUtil.assertAttributeReflection(
+ new OffsetAttributeImpl(),
+ new HashMap<String, Object>() {
+ {
+ put(OffsetAttribute.class.getName() + "#startOffset", 0);
+ put(OffsetAttribute.class.getName() + "#endOffset", 0);
+ }
+ });
}
-
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java
index baae18a..90e63d9 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java
@@ -16,16 +16,12 @@
*/
package org.apache.lucene.search.highlight;
-/**
- * Simple {@link Encoder} implementation that does not modify the output
- *
- */
+/** Simple {@link Encoder} implementation that does not modify the output */
public class DefaultEncoder implements Encoder {
- public DefaultEncoder() {
- }
+ public DefaultEncoder() {}
@Override
public String encodeText(String originalText) {
return originalText;
}
-}
\ No newline at end of file
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java
index eb2a957..c181cc5 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java
@@ -16,13 +16,8 @@
*/
package org.apache.lucene.search.highlight;
-/**
- * Encodes original text. The Encoder works with the {@link Formatter} to generate output.
- *
- */
+/** Encodes original text. The Encoder works with the {@link Formatter} to generate output. */
public interface Encoder {
- /**
- * @param originalText The section of text being output
- */
+ /** @param originalText The section of text being output */
String encodeText(String originalText);
-}
\ No newline at end of file
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java
index 8b010ef..7d9190e 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java
@@ -16,16 +16,14 @@
*/
package org.apache.lucene.search.highlight;
/**
- * Processes terms found in the original text, typically by applying some form
- * of mark-up to highlight terms in HTML search results pages.
- *
+ * Processes terms found in the original text, typically by applying some form of mark-up to
+ * highlight terms in HTML search results pages.
*/
-public interface Formatter
-{
+public interface Formatter {
/**
* @param originalText The section of text being considered for markup
- * @param tokenGroup contains one or several overlapping Tokens along with
- * their scores and positions.
+ * @param tokenGroup contains one or several overlapping Tokens along with their scores and
+ * positions.
*/
String highlightTerm(String originalText, TokenGroup tokenGroup);
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
index 8c0187c..1e79129 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
@@ -15,31 +15,28 @@
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
+
import org.apache.lucene.analysis.TokenStream;
/**
- * Implements the policy for breaking text into multiple fragments for
- * consideration by the {@link Highlighter} class. A sophisticated
- * implementation may do this on the basis of detecting end of sentences in the
- * text.
+ * Implements the policy for breaking text into multiple fragments for consideration by the {@link
+ * Highlighter} class. A sophisticated implementation may do this on the basis of detecting end of
+ * sentences in the text.
*/
public interface Fragmenter {
/**
- * Initializes the Fragmenter. You can grab references to the Attributes you are
- * interested in from tokenStream and then access the values in {@link #isNewFragment()}.
- *
+ * Initializes the Fragmenter. You can grab references to the Attributes you are interested in
+ * from tokenStream and then access the values in {@link #isNewFragment()}.
+ *
* @param originalText the original source text
* @param tokenStream the {@link TokenStream} to be fragmented
*/
public void start(String originalText, TokenStream tokenStream);
-
/**
- * Test to see if this token from the stream should be held in a new
- * TextFragment. Every time this is called, the TokenStream
- * passed to start(String, TokenStream) will have been incremented.
- *
+ * Test to see if this token from the stream should be held in a new TextFragment. Every time this
+ * is called, the TokenStream passed to start(String, TokenStream) will have been incremented.
*/
public boolean isNewFragment();
}
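Taken together, Encoder, Formatter and Fragmenter are the highlighter's pluggable pieces. As a minimal sketch of the Formatter contract above (a hypothetical example, not part of this commit; SimpleHTMLFormatter in this package plays this role for real):

  /** Hypothetical Formatter: wrap any scored token group in <b> tags. */
  class BoldFormatter implements Formatter {
    @Override
    public String highlightTerm(String originalText, TokenGroup tokenGroup) {
      if (tokenGroup.getTotalScore() <= 0) {
        return originalText; // no query term matched this span; emit it unchanged
      }
      return "<b>" + originalText + "</b>";
    }
  }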
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java
index 8d4250a..9178f44 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java
@@ -15,213 +15,171 @@
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
-/**
- * Formats text with different color intensity depending on the score of the
- * term.
- *
- */
-public class GradientFormatter implements Formatter
-{
- private float maxScore;
-
- int fgRMin, fgGMin, fgBMin;
-
- int fgRMax, fgGMax, fgBMax;
-
- protected boolean highlightForeground;
-
- int bgRMin, bgGMin, bgBMin;
-
- int bgRMax, bgGMax, bgBMax;
-
- protected boolean highlightBackground;
-
- /**
- * Sets the color range for the IDF scores
- *
- * @param maxScore
- * The score (and above) displayed as maxColor (See {@link QueryScorer#getMaxTermWeight()}
- * which can be used to calibrate scoring scale)
- * @param minForegroundColor
- * The hex color used for representing IDF scores of zero eg
- * #FFFFFF (white) or null if no foreground color required
- * @param maxForegroundColor
- * The largest hex color used for representing IDF scores eg
- * #000000 (black) or null if no foreground color required
- * @param minBackgroundColor
- * The hex color used for representing IDF scores of zero eg
- * #FFFFFF (white) or null if no background color required
- * @param maxBackgroundColor
- * The largest hex color used for representing IDF scores eg
- * #000000 (black) or null if no background color required
- */
- public GradientFormatter(float maxScore, String minForegroundColor,
- String maxForegroundColor, String minBackgroundColor,
- String maxBackgroundColor)
- {
- highlightForeground = (minForegroundColor != null)
- && (maxForegroundColor != null);
- if (highlightForeground)
- {
- if (minForegroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minForegroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- if (maxForegroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minForegroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- fgRMin = hexToInt(minForegroundColor.substring(1, 3));
- fgGMin = hexToInt(minForegroundColor.substring(3, 5));
- fgBMin = hexToInt(minForegroundColor.substring(5, 7));
-
- fgRMax = hexToInt(maxForegroundColor.substring(1, 3));
- fgGMax = hexToInt(maxForegroundColor.substring(3, 5));
- fgBMax = hexToInt(maxForegroundColor.substring(5, 7));
- }
-
- highlightBackground = (minBackgroundColor != null)
- && (maxBackgroundColor != null);
- if (highlightBackground)
- {
- if (minBackgroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minBackgroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- if (maxBackgroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minBackgroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- bgRMin = hexToInt(minBackgroundColor.substring(1, 3));
- bgGMin = hexToInt(minBackgroundColor.substring(3, 5));
- bgBMin = hexToInt(minBackgroundColor.substring(5, 7));
-
- bgRMax = hexToInt(maxBackgroundColor.substring(1, 3));
- bgGMax = hexToInt(maxBackgroundColor.substring(3, 5));
- bgBMax = hexToInt(maxBackgroundColor.substring(5, 7));
- }
- // this.corpusReader = corpusReader;
- this.maxScore = maxScore;
- // totalNumDocs = corpusReader.numDocs();
+/** Formats text with different color intensity depending on the score of the term. */
+public class GradientFormatter implements Formatter {
+ private float maxScore;
+
+ int fgRMin, fgGMin, fgBMin;
+ int fgRMax, fgGMax, fgBMax;
+
+ protected boolean highlightForeground;
+
+ int bgRMin, bgGMin, bgBMin;
+ int bgRMax, bgGMax, bgBMax;
+
+ protected boolean highlightBackground;
+
+ /**
+ * Sets the color range for the IDF scores
+ *
+ * @param maxScore The score (and above) displayed as maxColor (See {@link
+ * QueryScorer#getMaxTermWeight()} which can be used to calibrate scoring scale)
+ * @param minForegroundColor The hex color used for representing IDF scores of zero eg #FFFFFF
+ * (white) or null if no foreground color required
+ * @param maxForegroundColor The largest hex color used for representing IDF scores eg #000000
+ * (black) or null if no foreground color required
+ * @param minBackgroundColor The hex color used for representing IDF scores of zero eg #FFFFFF
+ * (white) or null if no background color required
+ * @param maxBackgroundColor The largest hex color used for representing IDF scores eg #000000
+ * (black) or null if no background color required
+ */
+ public GradientFormatter(
+ float maxScore,
+ String minForegroundColor,
+ String maxForegroundColor,
+ String minBackgroundColor,
+ String maxBackgroundColor) {
+ highlightForeground = (minForegroundColor != null) && (maxForegroundColor != null);
+ if (highlightForeground) {
+ if (minForegroundColor.length() != 7) {
+ throw new IllegalArgumentException(
+ "minForegroundColor is not 7 bytes long eg a hex " + "RGB value such as #FFFFFF");
+ }
+ if (maxForegroundColor.length() != 7) {
+ throw new IllegalArgumentException(
+ "minForegroundColor is not 7 bytes long eg a hex " + "RGB value such as #FFFFFF");
+ }
+ fgRMin = hexToInt(minForegroundColor.substring(1, 3));
+ fgGMin = hexToInt(minForegroundColor.substring(3, 5));
+ fgBMin = hexToInt(minForegroundColor.substring(5, 7));
+
+ fgRMax = hexToInt(maxForegroundColor.substring(1, 3));
+ fgGMax = hexToInt(maxForegroundColor.substring(3, 5));
+ fgBMax = hexToInt(maxForegroundColor.substring(5, 7));
}
- @Override
- public String highlightTerm(String originalText, TokenGroup tokenGroup)
- {
- if (tokenGroup.getTotalScore() == 0)
- return originalText;
- float score = tokenGroup.getTotalScore();
- if (score == 0)
- {
- return originalText;
- }
- StringBuilder sb = new StringBuilder();
- sb.append("<font ");
- if (highlightForeground)
- {
- sb.append("color=\"");
- sb.append(getForegroundColorString(score));
- sb.append("\" ");
- }
- if (highlightBackground)
- {
- sb.append("bgcolor=\"");
- sb.append(getBackgroundColorString(score));
- sb.append("\" ");
- }
- sb.append(">");
- sb.append(originalText);
- sb.append("</font>");
- return sb.toString();
+ highlightBackground = (minBackgroundColor != null) && (maxBackgroundColor != null);
+ if (highlightBackground) {
+ if (minBackgroundColor.length() != 7) {
+ throw new IllegalArgumentException(
+ "minBackgroundColor is not 7 bytes long eg a hex " + "RGB value such as #FFFFFF");
+ }
+ if (maxBackgroundColor.length() != 7) {
+ throw new IllegalArgumentException(
+ "minBackgroundColor is not 7 bytes long eg a hex " + "RGB value such as #FFFFFF");
+ }
+ bgRMin = hexToInt(minBackgroundColor.substring(1, 3));
+ bgGMin = hexToInt(minBackgroundColor.substring(3, 5));
+ bgBMin = hexToInt(minBackgroundColor.substring(5, 7));
+
+ bgRMax = hexToInt(maxBackgroundColor.substring(1, 3));
+ bgGMax = hexToInt(maxBackgroundColor.substring(3, 5));
+ bgBMax = hexToInt(maxBackgroundColor.substring(5, 7));
}
-
- protected String getForegroundColorString(float score)
- {
- int rVal = getColorVal(fgRMin, fgRMax, score);
- int gVal = getColorVal(fgGMin, fgGMax, score);
- int bVal = getColorVal(fgBMin, fgBMax, score);
- StringBuilder sb = new StringBuilder();
- sb.append("#");
- sb.append(intToHex(rVal));
- sb.append(intToHex(gVal));
- sb.append(intToHex(bVal));
- return sb.toString();
+ // this.corpusReader = corpusReader;
+ this.maxScore = maxScore;
+ // totalNumDocs = corpusReader.numDocs();
+ }
+
+ @Override
+ public String highlightTerm(String originalText, TokenGroup tokenGroup) {
+ if (tokenGroup.getTotalScore() == 0) return originalText;
+ float score = tokenGroup.getTotalScore();
+ if (score == 0) {
+ return originalText;
}
-
- protected String getBackgroundColorString(float score)
- {
- int rVal = getColorVal(bgRMin, bgRMax, score);
- int gVal = getColorVal(bgGMin, bgGMax, score);
- int bVal = getColorVal(bgBMin, bgBMax, score);
- StringBuilder sb = new StringBuilder();
- sb.append("#");
- sb.append(intToHex(rVal));
- sb.append(intToHex(gVal));
- sb.append(intToHex(bVal));
- return sb.toString();
+ StringBuilder sb = new StringBuilder();
+ sb.append("<font ");
+ if (highlightForeground) {
+ sb.append("color=\"");
+ sb.append(getForegroundColorString(score));
+ sb.append("\" ");
}
-
- private int getColorVal(int colorMin, int colorMax, float score)
- {
- if (colorMin == colorMax)
- {
- return colorMin;
- }
- float scale = Math.abs(colorMin - colorMax);
- float relScorePercent = Math.min(maxScore, score) / maxScore;
- float colScore = scale * relScorePercent;
- return Math.min(colorMin, colorMax) + (int) colScore;
+ if (highlightBackground) {
+ sb.append("bgcolor=\"");
+ sb.append(getBackgroundColorString(score));
+ sb.append("\" ");
}
-
- private static char hexDigits[] = { '0', '1', '2', '3', '4', '5', '6', '7',
- '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
-
- private static String intToHex(int i)
- {
- return "" + hexDigits[(i & 0xF0) >> 4] + hexDigits[i & 0x0F];
+ sb.append(">");
+ sb.append(originalText);
+ sb.append("</font>");
+ return sb.toString();
+ }
+
+ protected String getForegroundColorString(float score) {
+ int rVal = getColorVal(fgRMin, fgRMax, score);
+ int gVal = getColorVal(fgGMin, fgGMax, score);
+ int bVal = getColorVal(fgBMin, fgBMax, score);
+ StringBuilder sb = new StringBuilder();
+ sb.append("#");
+ sb.append(intToHex(rVal));
+ sb.append(intToHex(gVal));
+ sb.append(intToHex(bVal));
+ return sb.toString();
+ }
+
+ protected String getBackgroundColorString(float score) {
+ int rVal = getColorVal(bgRMin, bgRMax, score);
+ int gVal = getColorVal(bgGMin, bgGMax, score);
+ int bVal = getColorVal(bgBMin, bgBMax, score);
+ StringBuilder sb = new StringBuilder();
+ sb.append("#");
+ sb.append(intToHex(rVal));
+ sb.append(intToHex(gVal));
+ sb.append(intToHex(bVal));
+ return sb.toString();
+ }
+
+ private int getColorVal(int colorMin, int colorMax, float score) {
+ if (colorMin == colorMax) {
+ return colorMin;
}
-
- /**
- * Converts a hex string into an int. Integer.parseInt(hex, 16) assumes the
- * input is nonnegative unless there is a preceding minus sign. This method
- * reads the input as twos complement instead, so if the input is 8 bytes
- * long, it will correctly restore a negative int produced by
- * Integer.toHexString() but not necessarily one produced by
- * Integer.toString(x,16) since that method will produce a string like '-FF'
- * for negative integer values.
- *
- * @param hex
- * A string in capital or lower case hex, of no more then 16
- * characters.
- * @throws NumberFormatException
- * if the string is more than 16 characters long, or if any
- * character is not in the set [0-9a-fA-f]
- */
- public static final int hexToInt(String hex)
- {
- int len = hex.length();
- if (len > 16)
- throw new NumberFormatException();
-
- int l = 0;
- for (int i = 0; i < len; i++)
- {
- l <<= 4;
- int c = Character.digit(hex.charAt(i), 16);
- if (c < 0)
- throw new NumberFormatException();
- l |= c;
- }
- return l;
+ float scale = Math.abs(colorMin - colorMax);
+ float relScorePercent = Math.min(maxScore, score) / maxScore;
+ float colScore = scale * relScorePercent;
+ return Math.min(colorMin, colorMax) + (int) colScore;
+ }
+
+ private static char hexDigits[] = {
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
+ };
+
+ private static String intToHex(int i) {
+ return "" + hexDigits[(i & 0xF0) >> 4] + hexDigits[i & 0x0F];
+ }
+
+ /**
+ * Converts a hex string into an int. Integer.parseInt(hex, 16) assumes the input is nonnegative
+ * unless there is a preceding minus sign. This method reads the input as two's complement instead,
+ * so if the input is 8 characters long, it will correctly restore a negative int produced by
+ * Integer.toHexString() but not necessarily one produced by Integer.toString(x,16) since that
+ * method will produce a string like '-FF' for negative integer values.
+ *
+ * @param hex A string in capital or lower case hex, of no more than 16 characters.
+ * @throws NumberFormatException if the string is more than 16 characters long, or if any
+ * character is not in the set [0-9a-fA-F]
+ */
+ public static final int hexToInt(String hex) {
+ int len = hex.length();
+ if (len > 16) throw new NumberFormatException();
+
+ int l = 0;
+ for (int i = 0; i < len; i++) {
+ l <<= 4;
+ int c = Character.digit(hex.charAt(i), 16);
+ if (c < 0) throw new NumberFormatException();
+ l |= c;
}
-
+ return l;
+ }
}
-
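For reference, a minimal sketch of the two's-complement behaviour documented above (assuming the surrounding class is GradientFormatter, whose public static hexToInt appears in this hunk; the class and variable names in the sketch are illustrative):

    import org.apache.lucene.search.highlight.GradientFormatter;

    public class HexRoundTrip {
      public static void main(String[] args) {
        int original = -1;
        // Integer.toHexString(-1) -> "ffffffff": 8 hex characters, two's complement
        String hex = Integer.toHexString(original);
        // hexToInt reads the digits back as two's complement, so the sign survives
        int restored = GradientFormatter.hexToInt(hex);
        System.out.println(restored == original); // prints: true
        // Integer.toString(-255, 16) -> "-ff"; hexToInt would throw
        // NumberFormatException on the '-' sign, as the javadoc warns.
      }
    }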
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
index 6f3ea78..b0dee91 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Objects;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -28,15 +27,13 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.PriorityQueue;
/**
- * Marks up highlighted terms found in the best sections of
- * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
- * {@link Encoder} and tokenizers.
+ * Marks up highlighted terms found in the best sections of text, using configurable {@link
+ * Fragmenter}, {@link Scorer}, {@link Formatter}, {@link Encoder} and tokenizers.
*
- * This is Lucene's original Highlighter; there are others.
+ * <p>This is Lucene's original Highlighter; there are others.
*/
-public class Highlighter
-{
- public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
+public class Highlighter {
+ public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50 * 1024;
private Formatter formatter;
private Encoder encoder;
@@ -44,18 +41,15 @@ public class Highlighter
private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
private Fragmenter textFragmenter = new SimpleFragmenter();
- public Highlighter(Scorer fragmentScorer)
- {
- this(new SimpleHTMLFormatter(),fragmentScorer);
+ public Highlighter(Scorer fragmentScorer) {
+ this(new SimpleHTMLFormatter(), fragmentScorer);
}
- public Highlighter(Formatter formatter, Scorer fragmentScorer)
- {
- this(formatter,new DefaultEncoder(),fragmentScorer);
+ public Highlighter(Formatter formatter, Scorer fragmentScorer) {
+ this(formatter, new DefaultEncoder(), fragmentScorer);
}
- public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
- {
+ public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) {
ensureArgumentNotNull(formatter, "'formatter' must not be null");
ensureArgumentNotNull(encoder, "'encoder' must not be null");
ensureArgumentNotNull(fragmentScorer, "'fragmentScorer' must not be null");
@@ -66,140 +60,118 @@ public class Highlighter
}
/**
- * Highlights chosen terms in a text, extracting the most relevant section.
- * This is a convenience method that calls
- * {@link #getBestFragment(TokenStream, String)}
+ * Highlights chosen terms in a text, extracting the most relevant section. This is a convenience
+ * method that calls {@link #getBestFragment(TokenStream, String)}
*
- * @param analyzer the analyzer that will be used to split <code>text</code>
- * into chunks
+ * @param analyzer the analyzer that will be used to split <code>text</code> into chunks
* @param text text to highlight terms in
* @param fieldName Name of field used to influence analyzer's tokenization policy
- *
* @return highlighted text fragment or null if no terms found
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+ * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided
+ * text's length
*/
- public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
- throws IOException, InvalidTokenOffsetsException
- {
+ public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
+ throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
return getBestFragment(tokenStream, text);
}
/**
- * Highlights chosen terms in a text, extracting the most relevant section.
- * The document text is analysed in chunks to record hit statistics
- * across the document. After accumulating stats, the fragment with the highest score
- * is returned
+ * Highlights chosen terms in a text, extracting the most relevant section. The document text is
+ * analysed in chunks to record hit statistics across the document. After accumulating stats, the
+ * fragment with the highest score is returned
*
- * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
- * This is typically produced by an analyzer re-parsing a document's
- * text. Some work may be done on retrieving TokenStreams more efficiently
- * by adding support for storing original text position data in the Lucene
- * index but this support is not currently available (as of Lucene 1.4 rc2).
+ * @param tokenStream a stream of tokens identified in the text parameter, including offset
+ * information. This is typically produced by an analyzer re-parsing a document's text. Some
+ * work may be done on retrieving TokenStreams more efficiently by adding support for storing
+ * original text position data in the Lucene index but this support is not currently available
+ * (as of Lucene 1.4 rc2).
* @param text text to highlight terms in
- *
* @return highlighted text fragment or null if no terms found
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+ * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided
+ * text's length
*/
public final String getBestFragment(TokenStream tokenStream, String text)
- throws IOException, InvalidTokenOffsetsException
- {
- String[] results = getBestFragments(tokenStream,text, 1);
- if (results.length > 0)
- {
+ throws IOException, InvalidTokenOffsetsException {
+ String[] results = getBestFragments(tokenStream, text, 1);
+ if (results.length > 0) {
return results[0];
}
return null;
}
/**
- * Highlights chosen terms in a text, extracting the most relevant sections.
- * This is a convenience method that calls
- * {@link #getBestFragments(TokenStream, String, int)}
- *
- * @param analyzer the analyzer that will be used to split <code>text</code>
- * into chunks
- * @param fieldName the name of the field being highlighted (used by analyzer)
- * @param text text to highlight terms in
- * @param maxNumFragments the maximum number of fragments.
+ * Highlights chosen terms in a text, extracting the most relevant sections. This is a convenience
+ * method that calls {@link #getBestFragments(TokenStream, String, int)}
*
+ * @param analyzer the analyzer that will be used to split <code>text</code> into chunks
+ * @param fieldName the name of the field being highlighted (used by analyzer)
+ * @param text text to highlight terms in
+ * @param maxNumFragments the maximum number of fragments.
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+ * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided
+ * text's length
*/
public final String[] getBestFragments(
- Analyzer analyzer,
- String fieldName,
- String text,
- int maxNumFragments)
- throws IOException, InvalidTokenOffsetsException
- {
+ Analyzer analyzer, String fieldName, String text, int maxNumFragments)
+ throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
return getBestFragments(tokenStream, text, maxNumFragments);
}
/**
- * Highlights chosen terms in a text, extracting the most relevant sections.
- * The document text is analysed in chunks to record hit statistics
- * across the document. After accumulating stats, the fragments with the highest scores
- * are returned as an array of strings in order of score (contiguous fragments are merged into
- * one in their original order to improve readability)
- *
- * @param text text to highlight terms in
- * @param maxNumFragments the maximum number of fragments.
+ * Highlights chosen terms in a text, extracting the most relevant sections. The document text is
+ * analysed in chunks to record hit statistics across the document. After accumulating stats, the
+ * fragments with the highest scores are returned as an array of strings in order of score
+ * (contiguous fragments are merged into one in their original order to improve readability)
*
+ * @param text text to highlight terms in
+ * @param maxNumFragments the maximum number of fragments.
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+ * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided
+ * text's length
*/
- public final String[] getBestFragments(
- TokenStream tokenStream,
- String text,
- int maxNumFragments)
- throws IOException, InvalidTokenOffsetsException
- {
- maxNumFragments = Math.max(1, maxNumFragments); //sanity check
+ public final String[] getBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
+ throws IOException, InvalidTokenOffsetsException {
+ maxNumFragments = Math.max(1, maxNumFragments); // sanity check
- TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
+ TextFragment[] frag = getBestTextFragments(tokenStream, text, true, maxNumFragments);
- //Get text
+ // Get text
ArrayList<String> fragTexts = new ArrayList<>();
- for (int i = 0; i < frag.length; i++)
- {
- if ((frag[i] != null) && (frag[i].getScore() > 0))
- {
+ for (int i = 0; i < frag.length; i++) {
+ if ((frag[i] != null) && (frag[i].getScore() > 0)) {
fragTexts.add(frag[i].toString());
}
}
return fragTexts.toArray(new String[0]);
}
-
/**
- * Low level api to get the most relevant (formatted) sections of the document.
- * This method has been made public to allow visibility of score information held in TextFragment objects.
- * Thanks to Jason Calabrese for help in redefining the interface.
+ * Low-level API to get the most relevant (formatted) sections of the document. This method has
+ * been made public to allow visibility of score information held in TextFragment objects. Thanks
+ * to Jason Calabrese for help in redefining the interface.
+ *
* @throws IOException If there is a low-level I/O error
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+ * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided
+ * text's length
*/
public final TextFragment[] getBestTextFragments(
- TokenStream tokenStream,
- String text,
- boolean mergeContiguousFragments,
- int maxNumFragments)
- throws IOException, InvalidTokenOffsetsException
- {
+ TokenStream tokenStream, String text, boolean mergeContiguousFragments, int maxNumFragments)
+ throws IOException, InvalidTokenOffsetsException {
ArrayList<TextFragment> docFrags = new ArrayList<>();
- StringBuilder newText=new StringBuilder();
+ StringBuilder newText = new StringBuilder();
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
- TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
+ TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
if (fragmentScorer instanceof QueryScorer) {
((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
}
TokenStream newStream = fragmentScorer.init(tokenStream);
- if(newStream != null) {
+ if (newStream != null) {
tokenStream = newStream;
}
fragmentScorer.startFragment(currentFrag);
@@ -207,8 +179,7 @@ public class Highlighter
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
- try
- {
+ try {
String tokenText;
int startOffset;
@@ -216,42 +187,39 @@ public class Highlighter
int lastEndOffset = 0;
textFragmenter.start(text, tokenStream);
- TokenGroup tokenGroup=new TokenGroup(tokenStream);
+ TokenGroup tokenGroup = new TokenGroup(tokenStream);
tokenStream.reset();
- for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
- next = tokenStream.incrementToken())
- {
- if( (offsetAtt.endOffset()>text.length())
- ||
- (offsetAtt.startOffset()>text.length())
- )
- {
- throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
- +" exceeds length of provided text sized "+text.length());
+ for (boolean next = tokenStream.incrementToken();
+ next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
+ next = tokenStream.incrementToken()) {
+ if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
+ throw new InvalidTokenOffsetsException(
+ "Token "
+ + termAtt.toString()
+ + " exceeds length of provided text sized "
+ + text.length());
}
- if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct()))
- {
- //the current token is distinct from previous tokens -
+ if ((tokenGroup.getNumTokens() > 0) && (tokenGroup.isDistinct())) {
+ // the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.getStartOffset();
endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset);
- String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
- //store any whitespace etc from between this and last group
+ String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
+ // store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
- lastEndOffset=Math.max(endOffset, lastEndOffset);
+ lastEndOffset = Math.max(endOffset, lastEndOffset);
tokenGroup.clear();
- //check if current token marks the start of a new fragment
- if(textFragmenter.isNewFragment())
- {
+ // check if current token marks the start of a new fragment
+ if (textFragmenter.isNewFragment()) {
currentFrag.setScore(fragmentScorer.getFragmentScore());
- //record stats for a new fragment
+ // record stats for a new fragment
currentFrag.textEndPos = newText.length();
- currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
+ currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
@@ -259,48 +227,44 @@ public class Highlighter
tokenGroup.addToken(fragmentScorer.getTokenScore());
-// if(lastEndOffset>maxDocBytesToAnalyze)
-// {
-// break;
-// }
+ // if(lastEndOffset>maxDocBytesToAnalyze)
+ // {
+ // break;
+ // }
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
- if(tokenGroup.getNumTokens() >0)
- {
- //flush the accumulated text (same code as in above loop)
+ if (tokenGroup.getNumTokens() > 0) {
+ // flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.getStartOffset();
endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset);
- String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
- //store any whitespace etc from between this and last group
+ String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
+ // store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
- lastEndOffset=Math.max(lastEndOffset,endOffset);
+ lastEndOffset = Math.max(lastEndOffset, endOffset);
}
- //Test what remains of the original text beyond the point where we stopped analyzing
+ // Test what remains of the original text beyond the point where we stopped analyzing
if (
-// if there is text beyond the last token considered..
- (lastEndOffset < text.length())
+ // if there is text beyond the last token considered..
+ (lastEndOffset < text.length())
&&
-// and that text is not too large...
- (text.length()<= maxDocCharsToAnalyze)
- )
- {
- //append it to the last fragment
+ // and that text is not too large...
+ (text.length() <= maxDocCharsToAnalyze)) {
+ // append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}
currentFrag.textEndPos = newText.length();
- //sort the most relevant sections of the text
- for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
- {
+ // sort the most relevant sections of the text
+ for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext(); ) {
currentFrag = i.next();
- //If you are running with a version of Lucene before 11th Sept 03
+ // If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*
if (currentFrag.getScore() >= minScore)
@@ -315,83 +279,66 @@ public class Highlighter
}
*/
- //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
- //fix to PriorityQueue. The correct method to use here is the new "insert" method
+ // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+ // fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insertWithOverflow(currentFrag);
}
- //return the most relevant fragments
+ // return the most relevant fragments
TextFragment frag[] = new TextFragment[fragQueue.size()];
- for (int i = frag.length - 1; i >= 0; i--)
- {
+ for (int i = frag.length - 1; i >= 0; i--) {
frag[i] = fragQueue.pop();
}
- //merge any contiguous fragments to improve readability
- if(mergeContiguousFragments)
- {
+ // merge any contiguous fragments to improve readability
+ if (mergeContiguousFragments) {
mergeContiguousFragments(frag);
ArrayList<TextFragment> fragTexts = new ArrayList<>();
- for (int i = 0; i < frag.length; i++)
- {
- if ((frag[i] != null) && (frag[i].getScore() > 0))
- {
+ for (int i = 0; i < frag.length; i++) {
+ if ((frag[i] != null) && (frag[i].getScore() > 0)) {
fragTexts.add(frag[i]);
}
}
- frag= fragTexts.toArray(new TextFragment[0]);
+ frag = fragTexts.toArray(new TextFragment[0]);
}
return frag;
- }
- finally
- {
- if (tokenStream != null)
- {
- try
- {
+ } finally {
+ if (tokenStream != null) {
+ try {
tokenStream.end();
tokenStream.close();
- }
- catch (Exception e)
- {
+ } catch (Exception e) {
}
}
}
}
-
- /** Improves readability of a score-sorted list of TextFragments by merging any fragments
- * that were contiguous in the original text into one larger fragment with the correct order.
- * This will leave a "null" in the array entry for the lesser scored fragment.
+ /**
+ * Improves readability of a score-sorted list of TextFragments by merging any fragments that were
+ * contiguous in the original text into one larger fragment with the correct order. This will
+ * leave a "null" in the array entry for the lesser scored fragment.
*
* @param frag An array of document fragments in descending score
*/
- private void mergeContiguousFragments(TextFragment[] frag)
- {
+ private void mergeContiguousFragments(TextFragment[] frag) {
boolean mergingStillBeingDone;
if (frag.length > 1)
- do
- {
- mergingStillBeingDone = false; //initialise loop control flag
- //for each fragment, scan other frags looking for contiguous blocks
- for (int i = 0; i < frag.length; i++)
- {
- if (frag[i] == null)
- {
+ do {
+ mergingStillBeingDone = false; // initialise loop control flag
+ // for each fragment, scan other frags looking for contiguous blocks
+ for (int i = 0; i < frag.length; i++) {
+ if (frag[i] == null) {
continue;
}
- //merge any contiguous blocks
- for (int x = 0; x < frag.length; x++)
- {
- if (frag[x] == null)
- {
+ // merge any contiguous blocks
+ for (int x = 0; x < frag.length; x++) {
+ if (frag[x] == null) {
continue;
}
- if (frag[i] == null)
- {
+ if (frag[i] == null) {
break;
}
TextFragment frag1 = null;
@@ -400,32 +347,24 @@ public class Highlighter
int frag2Num = 0;
int bestScoringFragNum;
int worstScoringFragNum;
- //if blocks are contiguous....
- if (frag[i].follows(frag[x]))
- {
+ // if blocks are contiguous....
+ if (frag[i].follows(frag[x])) {
frag1 = frag[x];
frag1Num = x;
frag2 = frag[i];
frag2Num = i;
+ } else if (frag[x].follows(frag[i])) {
+ frag1 = frag[i];
+ frag1Num = i;
+ frag2 = frag[x];
+ frag2Num = x;
}
- else
- if (frag[x].follows(frag[i]))
- {
- frag1 = frag[i];
- frag1Num = i;
- frag2 = frag[x];
- frag2Num = x;
- }
- //merging required..
- if (frag1 != null)
- {
- if (frag1.getScore() > frag2.getScore())
- {
+ // merging required..
+ if (frag1 != null) {
+ if (frag1.getScore() > frag2.getScore()) {
bestScoringFragNum = frag1Num;
worstScoringFragNum = frag2Num;
- }
- else
- {
+ } else {
bestScoringFragNum = frag2Num;
worstScoringFragNum = frag1Num;
}
@@ -436,38 +375,29 @@ public class Highlighter
}
}
}
- }
- while (mergingStillBeingDone);
+ } while (mergingStillBeingDone);
}
-
/**
- * Highlights terms in the text , extracting the most relevant sections
- * and concatenating the chosen fragments with a separator (typically "...").
- * The document text is analysed in chunks to record hit statistics
- * across the document. After accumulating stats, the fragments with the highest scores
- * are returned in order as "separator" delimited strings.
- *
- * @param text text to highlight terms in
- * @param maxNumFragments the maximum number of fragments.
- * @param separator the separator used to intersperse the document fragments (typically "...")
+ * Highlights terms in the text, extracting the most relevant sections and concatenating the
+ * chosen fragments with a separator (typically "..."). The document text is analysed in chunks to
+ * record hit statistics across the document. After accumulating stats, the fragments with the
+ * highest scores are returned in order as "separator" delimited strings.
*
+ * @param text text to highlight terms in
+ * @param maxNumFragments the maximum number of fragments.
+ * @param separator the separator used to intersperse the document fragments (typically "...")
* @return highlighted text
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+ * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided
+ * text's length
*/
public final String getBestFragments(
- TokenStream tokenStream,
- String text,
- int maxNumFragments,
- String separator)
- throws IOException, InvalidTokenOffsetsException
- {
- String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
+ TokenStream tokenStream, String text, int maxNumFragments, String separator)
+ throws IOException, InvalidTokenOffsetsException {
+ String sections[] = getBestFragments(tokenStream, text, maxNumFragments);
StringBuilder result = new StringBuilder();
- for (int i = 0; i < sections.length; i++)
- {
- if (i > 0)
- {
+ for (int i = 0; i < sections.length; i++) {
+ if (i > 0) {
result.append(separator);
}
result.append(sections[i]);
@@ -483,26 +413,20 @@ public class Highlighter
this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
}
- public Fragmenter getTextFragmenter()
- {
+ public Fragmenter getTextFragmenter() {
return textFragmenter;
}
- public void setTextFragmenter(Fragmenter fragmenter)
- {
+ public void setTextFragmenter(Fragmenter fragmenter) {
textFragmenter = Objects.requireNonNull(fragmenter);
}
- /**
- * @return Object used to score each text fragment
- */
- public Scorer getFragmentScorer()
- {
+ /** @return Object used to score each text fragment */
+ public Scorer getFragmentScorer() {
return fragmentScorer;
}
- public void setFragmentScorer(Scorer scorer)
- {
+ public void setFragmentScorer(Scorer scorer) {
fragmentScorer = Objects.requireNonNull(scorer);
}
@@ -518,7 +442,7 @@ public class Highlighter
* Throws an IllegalArgumentException with the provided message if 'argument' is null.
*
* @param argument the argument to be null-checked
- * @param message the message of the exception thrown if argument == null
+ * @param message the message of the exception thrown if argument == null
*/
private static void ensureArgumentNotNull(Object argument, String message) {
if (argument == null) {
@@ -526,20 +450,15 @@ public class Highlighter
}
}
- static class FragmentQueue extends PriorityQueue<TextFragment>
- {
- FragmentQueue(int size)
- {
+ static class FragmentQueue extends PriorityQueue<TextFragment> {
+ FragmentQueue(int size) {
super(size);
}
@Override
- public final boolean lessThan(TextFragment fragA, TextFragment fragB)
- {
- if (fragA.getScore() == fragB.getScore())
- return fragA.fragNum > fragB.fragNum;
- else
- return fragA.getScore() < fragB.getScore();
+ public final boolean lessThan(TextFragment fragA, TextFragment fragB) {
+ if (fragA.getScore() == fragB.getScore()) return fragA.fragNum > fragB.fragNum;
+ else return fragA.getScore() < fragB.getScore();
}
}
}
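A hedged usage sketch of the Highlighter API above (query, analyzer, text and the "contents" field name are assumed inputs, not part of this commit; all constructors and the getBestFragments overload are the ones shown in this file):

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

    class HighlightUtil {
      static String[] bestThree(Query query, Analyzer analyzer, String text)
          throws IOException, InvalidTokenOffsetsException {
        // Score fragments by query terms; mark hits with the default <B>...</B> tags.
        Highlighter highlighter =
            new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        // Re-analyzes 'text' as the "contents" field, returning up to 3 scored fragments.
        return highlighter.getBestFragments(analyzer, "contents", text, 3);
      }
    }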
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
index fe353a2..8c7f961 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
@@ -15,16 +15,10 @@
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
-/**
- * Exception thrown if TokenStream Tokens are incompatible with provided text
- *
- */
-public class InvalidTokenOffsetsException extends Exception
-{
+/** Exception thrown if TokenStream Tokens are incompatible with provided text */
+public class InvalidTokenOffsetsException extends Exception {
- public InvalidTokenOffsetsException(String message)
- {
+ public InvalidTokenOffsetsException(String message) {
super(message);
}
-
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/LimitTokenOffsetFilter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/LimitTokenOffsetFilter.java
index 3976357..fc27a03 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/LimitTokenOffsetFilter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/LimitTokenOffsetFilter.java
@@ -17,14 +17,13 @@
package org.apache.lucene.search.highlight;
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
- * This is a simplified version of org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter to prevent
- * a dependency on analysis-common.jar.
+ * This is a simplified version of org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter
+ * to prevent a dependency on analysis-common.jar.
*/
final class LimitTokenOffsetFilter extends TokenFilter {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java
index e605221..73736e9 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java
@@ -15,20 +15,19 @@
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
+
import org.apache.lucene.analysis.TokenStream;
/**
- * {@link Fragmenter} implementation which does not fragment the text.
- * This is useful for highlighting the entire content of a document or field.
+ * {@link Fragmenter} implementation which does not fragment the text. This is useful for
+ * highlighting the entire content of a document or field.
*/
public class NullFragmenter implements Fragmenter {
@Override
- public void start(String s, TokenStream tokenStream) {
- }
+ public void start(String s, TokenStream tokenStream) {}
@Override
public boolean isNewFragment() {
return false;
}
-
}
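A small sketch of the typical use (highlighter, analyzer and text are the illustrative names from the Highlighter example earlier):

    // One fragment spans the whole value, so the entire field comes back marked up.
    highlighter.setTextFragmenter(new NullFragmenter());
    String wholeField = highlighter.getBestFragment(analyzer, "contents", text);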
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java
index 23557b5..ba598f7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java
@@ -17,26 +17,22 @@
package org.apache.lucene.search.highlight;
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-/**
- * This TokenFilter limits the number of tokens while indexing by adding up the
- * current offset.
- */
+/** This TokenFilter limits the number of tokens while indexing by adding up the current offset. */
public final class OffsetLimitTokenFilter extends TokenFilter {
-
+
private int offsetCount;
private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
private int offsetLimit;
-
+
public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
super(input);
this.offsetLimit = offsetLimit;
}
-
+
@Override
public boolean incrementToken() throws IOException {
if (offsetCount < offsetLimit && input.incrementToken()) {
@@ -46,11 +42,10 @@ public final class OffsetLimitTokenFilter extends TokenFilter {
}
return false;
}
-
+
@Override
public void reset() throws IOException {
super.reset();
offsetCount = 0;
}
-
}
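A hedged sketch of wrapping a stream with this filter (tokenStream is an assumed, already-constructed TokenStream):

    // Stops emitting tokens once the accumulated offsets pass 10 * 1024;
    // reset() starts the count over for the next document.
    TokenStream limited = new OffsetLimitTokenFilter(tokenStream, 10 * 1024);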
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/PositionSpan.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/PositionSpan.java
index ac7a9d0..c08abc7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/PositionSpan.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/PositionSpan.java
@@ -17,6 +17,7 @@
package org.apache.lucene.search.highlight;
/**
* Utility class to record Positions Spans
+ *
* @lucene.internal
*/
public class PositionSpan {
@@ -27,4 +28,4 @@ public class PositionSpan {
this.start = start;
this.end = end;
}
-}
\ No newline at end of file
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
index 5452fc7..977dabb 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
@@ -21,7 +21,6 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -32,15 +31,14 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
/**
- * {@link Scorer} implementation which scores text fragments by the number of
- * unique query terms found. This class converts appropriate {@link Query}s to
- * {@link SpanQuery}s and attempts to score only those terms that participated in
- * generating the 'hit' on the document.
+ * {@link Scorer} implementation which scores text fragments by the number of unique query terms
+ * found. This class converts appropriate {@link Query}s to {@link SpanQuery}s and attempts to score
+ * only those terms that participated in generating the 'hit' on the document.
*/
public class QueryScorer implements Scorer {
private float totalScore;
private Set<String> foundTerms;
- private Map<String,WeightedSpanTerm> fieldWeightedSpanTerms;
+ private Map<String, WeightedSpanTerm> fieldWeightedSpanTerms;
private float maxTermWeight;
private int position = -1;
private String defaultField;
@@ -55,9 +53,7 @@ public class QueryScorer implements Scorer {
private int maxCharsToAnalyze;
private boolean usePayloads = false;
- /**
- * @param query Query to use for highlighting
- */
+ /** @param query Query to use for highlighting */
public QueryScorer(Query query) {
init(query, null, null, true);
}
@@ -79,7 +75,6 @@ public class QueryScorer implements Scorer {
init(query, field, reader, true);
}
-
/**
* @param query to use for highlighting
* @param reader {@link IndexReader} to use for quasi tf/idf scoring
@@ -90,25 +85,20 @@ public class QueryScorer implements Scorer {
init(query, field, reader, true);
}
- /**
- * @param defaultField - The default field for queries with the field name unspecified
- */
+ /** @param defaultField - The default field for queries with the field name unspecified */
public QueryScorer(Query query, String field, String defaultField) {
this.defaultField = defaultField;
init(query, field, null, true);
}
- /**
- * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s
- */
+ /** @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s */
public QueryScorer(WeightedSpanTerm[] weightedTerms) {
this.fieldWeightedSpanTerms = new HashMap<>(weightedTerms.length);
for (int i = 0; i < weightedTerms.length; i++) {
WeightedSpanTerm existingTerm = fieldWeightedSpanTerms.get(weightedTerms[i].term);
- if ((existingTerm == null) ||
- (existingTerm.weight < weightedTerms[i].weight)) {
+ if ((existingTerm == null) || (existingTerm.weight < weightedTerms[i].weight)) {
// if a term is defined more than once, always use the highest
// scoring weight
fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]);
@@ -129,9 +119,8 @@ public class QueryScorer implements Scorer {
}
/**
- *
- * @return The highest weighted term (useful for passing to
- * GradientFormatter to set top end of coloring scale).
+ * @return The highest weighted term (useful for passing to GradientFormatter to set top end of
+ * coloring scale).
*/
public float getMaxTermWeight() {
return maxTermWeight;
@@ -150,13 +139,11 @@ public class QueryScorer implements Scorer {
WeightedSpanTerm weightedSpanTerm;
- if ((weightedSpanTerm = fieldWeightedSpanTerms.get(
- termText)) == null) {
+ if ((weightedSpanTerm = fieldWeightedSpanTerms.get(termText)) == null) {
return 0;
}
- if (weightedSpanTerm.positionSensitive &&
- !weightedSpanTerm.checkPosition(position)) {
+ if (weightedSpanTerm.positionSensitive && !weightedSpanTerm.checkPosition(position)) {
return 0;
}
@@ -179,18 +166,18 @@ public class QueryScorer implements Scorer {
position = -1;
termAtt = tokenStream.addAttribute(CharTermAttribute.class);
posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
- if(!skipInitExtractor) {
- if(fieldWeightedSpanTerms != null) {
+ if (!skipInitExtractor) {
+ if (fieldWeightedSpanTerms != null) {
fieldWeightedSpanTerms.clear();
}
return initExtractor(tokenStream);
}
return null;
}
-
+
/**
- * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing
- * Span information to a {@link Fragmenter}.
+ * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing Span
+ * information to a {@link Fragmenter}.
*
* @param token to get {@link WeightedSpanTerm} for
* @return WeightedSpanTerm for token
@@ -199,15 +186,14 @@ public class QueryScorer implements Scorer {
return fieldWeightedSpanTerms.get(token);
}
- /**
- */
+ /** */
private void init(Query query, String field, IndexReader reader, boolean expandMultiTermQuery) {
this.reader = reader;
this.expandMultiTermQuery = expandMultiTermQuery;
this.query = query;
this.field = field;
}
-
+
private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
WeightedSpanTermExtractor qse = newTermExtractor(defaultField);
qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
@@ -215,19 +201,18 @@ public class QueryScorer implements Scorer {
qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
qse.setUsePayloads(usePayloads);
if (reader == null) {
- this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, 1f,
- tokenStream, field);
+ this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, 1f, tokenStream, field);
} else {
- this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query, 1f,
- tokenStream, field, reader);
+ this.fieldWeightedSpanTerms =
+ qse.getWeightedSpanTermsWithScores(query, 1f, tokenStream, field, reader);
}
- if(qse.isCachedTokenStream()) {
+ if (qse.isCachedTokenStream()) {
return qse.getTokenStream();
}
-
+
return null;
}
-
+
protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
return new WeightedSpanTermExtractor(defaultField);
}
@@ -242,18 +227,16 @@ public class QueryScorer implements Scorer {
foundTerms = new HashSet<>();
totalScore = 0;
}
-
- /**
- * @return true if multi-term queries should be expanded
- */
+
+ /** @return true if multi-term queries should be expanded */
public boolean isExpandMultiTermQuery() {
return expandMultiTermQuery;
}
/**
- * Controls whether or not multi-term queries are expanded
- * against a {@link MemoryIndex} {@link IndexReader}.
- *
+ * Controls whether or not multi-term queries are expanded against a {@link MemoryIndex} {@link
+ * IndexReader}.
+ *
* @param expandMultiTermQuery true if multi-term queries should be expanded
*/
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
@@ -261,8 +244,9 @@ public class QueryScorer implements Scorer {
}
/**
- * Whether or not we should capture payloads in {@link MemoryIndex} at each position so that queries can access them.
- * This does not apply to term vector based TokenStreams, which support payloads only when the term vector has them.
+ * Whether or not we should capture payloads in {@link MemoryIndex} at each position so that
+ * queries can access them. This does not apply to term vector based TokenStreams, which support
+ * payloads only when the term vector has them.
*/
public boolean isUsePayloads() {
return usePayloads;
@@ -273,12 +257,10 @@ public class QueryScorer implements Scorer {
}
/**
- * By default, {@link TokenStream}s that are not of the type
- * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
- * ensure an efficient reset - if you are already using a different caching
- * {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false. Note that term-vector based tokenstreams are detected and won't be
- * wrapped either.
+ * By default, {@link TokenStream}s that are not of the type {@link CachingTokenFilter} are
+ * wrapped in a {@link CachingTokenFilter} to ensure an efficient reset - if you are already using
+ * a different caching {@link TokenStream} impl and you don't want it to be wrapped, set this to
+ * false. Note that term-vector based tokenstreams are detected and won't be wrapped either.
*/
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
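A short sketch of wiring QueryScorer into a Highlighter (query is an assumed input; the constructors and setters are the ones shown in this file):

    QueryScorer scorer = new QueryScorer(query); // converts suitable queries to SpanQueries
    scorer.setExpandMultiTermQuery(true);        // expand wildcard/prefix queries (see above)
    Highlighter highlighter = new Highlighter(scorer);
    // After a highlighting pass, the strongest term weight can seed
    // GradientFormatter's top end of the coloring scale:
    float maxWeight = scorer.getMaxTermWeight();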
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
index 0b5738f..53617b3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;
-
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
@@ -31,17 +30,15 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
/**
- * Utility class used to extract the terms used in a query, plus any weights.
- * This class will not find terms for MultiTermQuery, TermRangeQuery and PrefixQuery classes
- * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
- * expanded terms.
- *
+ * Utility class used to extract the terms used in a query, plus any weights. This class will not
+ * find terms for MultiTermQuery, TermRangeQuery and PrefixQuery classes so the caller must pass a
+ * rewritten query (see Query.rewrite) to obtain a list of expanded terms.
*/
-public final class QueryTermExtractor
-{
+public final class QueryTermExtractor {
/** for term extraction */
private static final IndexSearcher EMPTY_INDEXSEARCHER;
+
static {
try {
IndexReader emptyReader = new MultiReader();
@@ -55,50 +52,45 @@ public final class QueryTermExtractor
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
- * @param query Query to extract term texts from
+ * @param query Query to extract term texts from
* @return an array of the terms used in a query, plus their weights.
*/
- public static final WeightedTerm[] getTerms(Query query)
- {
- return getTerms(query,false);
+ public static final WeightedTerm[] getTerms(Query query) {
+ return getTerms(query, false);
}
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
- * @param query Query to extract term texts from
- * @param reader used to compute IDF which can be used to a) score selected fragments better
- * b) use graded highlights eg changing intensity of font color
+ * @param query Query to extract term texts from
+ * @param reader used to compute IDF which can be used to a) score selected fragments better b)
+ * use graded highlights eg changing intensity of font color
* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
* @return an array of the terms used in a query, plus their weights.
*/
- public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
- {
- WeightedTerm[] terms=getTerms(query,false, fieldName);
- int totalNumDocs=reader.maxDoc();
- for (int i = 0; i < terms.length; i++)
- {
- try
- {
- int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
- //IDF algorithm taken from ClassicSimilarity class
- float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
- terms[i].weight*=idf;
- }
- catch (IOException e)
- {
- //ignore
- }
- }
+ public static final WeightedTerm[] getIdfWeightedTerms(
+ Query query, IndexReader reader, String fieldName) {
+ WeightedTerm[] terms = getTerms(query, false, fieldName);
+ int totalNumDocs = reader.maxDoc();
+ for (int i = 0; i < terms.length; i++) {
+ try {
+ int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
+ // IDF algorithm taken from ClassicSimilarity class
+ float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
+ terms[i].weight *= idf;
+ } catch (IOException e) {
+ // ignore
+ }
+ }
return terms;
}
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
- * @param query Query to extract term texts from
+ * @param query Query to extract term texts from
* @param prohibited <code>true</code> to extract "prohibited" terms, too
- * @param fieldName The fieldName used to filter query terms
+ * @param fieldName The fieldName used to filter query terms
* @return an array of the terms used in a query, plus their weights.
*/
public static WeightedTerm[] getTerms(Query query, boolean prohibited, String fieldName) {
@@ -111,13 +103,12 @@ public final class QueryTermExtractor
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
- * @param query Query to extract term texts from
+ * @param query Query to extract term texts from
* @param prohibited <code>true</code> to extract "prohibited" terms, too
* @return an array of the terms used in a query, plus their weights.
*/
- public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
- {
- return getTerms(query,prohibited,null);
+ public static final WeightedTerm[] getTerms(Query query, boolean prohibited) {
+ return getTerms(query, prohibited, null);
}
private static class BoostedTermExtractor extends QueryVisitor {
@@ -127,8 +118,11 @@ public final class QueryTermExtractor
final boolean includeProhibited;
final Predicate<String> fieldSelector;
- private BoostedTermExtractor(float boost, Set<WeightedTerm> terms, boolean includeProhibited,
- Predicate<String> fieldSelector) {
+ private BoostedTermExtractor(
+ float boost,
+ Set<WeightedTerm> terms,
+ boolean includeProhibited,
+ Predicate<String> fieldSelector) {
this.boost = boost;
this.terms = terms;
this.includeProhibited = includeProhibited;
@@ -150,7 +144,7 @@ public final class QueryTermExtractor
@Override
public QueryVisitor getSubVisitor(BooleanClause.Occur occur, Query parent) {
if (parent instanceof BoostQuery) {
- float newboost = boost * ((BoostQuery)parent).getBoost();
+ float newboost = boost * ((BoostQuery) parent).getBoost();
return new BoostedTermExtractor(newboost, terms, includeProhibited, fieldSelector);
}
if (occur == BooleanClause.Occur.MUST_NOT && includeProhibited == false) {
@@ -158,7 +152,5 @@ public final class QueryTermExtractor
}
return this;
}
-
}
-
}
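A hedged sketch of term extraction (reader is an assumed open IndexReader, and WeightedTerm's getTerm()/getWeight() accessors are assumed imports from this package; rewriting first expands MultiTermQuery/PrefixQuery terms as the class javadoc above requires):

    Query rewritten = query.rewrite(reader); // expand prefix/wildcard terms first
    for (WeightedTerm t : QueryTermExtractor.getTerms(rewritten)) {
      System.out.println(t.getTerm() + " weight=" + t.getWeight());
    }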
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java
index 229b42c..0d668f1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java
@@ -18,43 +18,40 @@ package org.apache.lucene.search.highlight;
import java.util.HashMap;
import java.util.HashSet;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
/**
- * {@link Scorer} implementation which scores text fragments by the number of
- * unique query terms found. This class uses the {@link QueryTermExtractor}
- * class to process determine the query terms and their boosts to be used.
+ * {@link Scorer} implementation which scores text fragments by the number of unique query terms
+ * found. This class uses the {@link QueryTermExtractor} class to determine the query terms and
+ * their boosts to be used.
*/
// TODO: provide option to boost score of fragments near beginning of document
// based on fragment.getFragNum()
public class QueryTermScorer implements Scorer {
-
+
TextFragment currentTextFragment = null;
HashSet<String> uniqueTermsInFragment;
float totalScore = 0;
float maxTermWeight = 0;
- private HashMap<String,WeightedTerm> termsToFind;
+ private HashMap<String, WeightedTerm> termsToFind;
private CharTermAttribute termAtt;
/**
- *
- * @param query a Lucene query (ideally rewritten using query.rewrite before
- * being passed to this class and the searcher)
+ * @param query a Lucene query (ideally rewritten using query.rewrite before being passed to this
+ * class and the searcher)
*/
public QueryTermScorer(Query query) {
this(QueryTermExtractor.getTerms(query));
}
/**
- *
- * @param query a Lucene query (ideally rewritten using query.rewrite before
- * being passed to this class and the searcher)
+ * @param query a Lucene query (ideally rewritten using query.rewrite before being passed to this
+ * class and the searcher)
* @param fieldName the Field name which is used to match Query terms
*/
public QueryTermScorer(Query query, String fieldName) {
@@ -62,14 +59,11 @@ public class QueryTermScorer implements Scorer {
}
/**
- *
- * @param query a Lucene query (ideally rewritten using query.rewrite before
- * being passed to this class and the searcher)
- * @param reader used to compute IDF which can be used to a) score selected
- * fragments better b) use graded highlights eg set font color
- * intensity
- * @param fieldName the field on which Inverse Document Frequency (IDF)
- * calculations are based
+ * @param query a Lucene query (ideally rewritten using query.rewrite before being passed to this
+ * class and the searcher)
+ * @param reader used to compute IDF which can be used to a) score selected fragments better b)
+ * use graded highlights eg set font color intensity
+ * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
*/
public QueryTermScorer(Query query, IndexReader reader, String fieldName) {
this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
@@ -78,10 +72,8 @@ public class QueryTermScorer implements Scorer {
public QueryTermScorer(WeightedTerm[] weightedTerms) {
termsToFind = new HashMap<>();
for (int i = 0; i < weightedTerms.length; i++) {
- WeightedTerm existingTerm = termsToFind
- .get(weightedTerms[i].term);
- if ((existingTerm == null)
- || (existingTerm.weight < weightedTerms[i].weight)) {
+ WeightedTerm existingTerm = termsToFind.get(weightedTerms[i].term);
+ if ((existingTerm == null) || (existingTerm.weight < weightedTerms[i].weight)) {
// if a term is defined more than once, always use the highest scoring
// weight
termsToFind.put(weightedTerms[i].term, weightedTerms[i]);
@@ -101,7 +93,7 @@ public class QueryTermScorer implements Scorer {
/*
* (non-Javadoc)
- *
+ *
* @see
* org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache
* .lucene.search.highlight.TextFragment)
@@ -111,10 +103,8 @@ public class QueryTermScorer implements Scorer {
uniqueTermsInFragment = new HashSet<>();
currentTextFragment = newFragment;
totalScore = 0;
-
}
-
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
*/
@@ -135,7 +125,6 @@ public class QueryTermScorer implements Scorer {
return queryTerm.getWeight();
}
-
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
*/
@@ -146,7 +135,7 @@ public class QueryTermScorer implements Scorer {
/*
* (non-Javadoc)
- *
+ *
* @see
* org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
*/
@@ -155,9 +144,8 @@ public class QueryTermScorer implements Scorer {
}
/**
- *
- * @return The highest weighted term (useful for passing to GradientFormatter
- * to set top end of coloring scale.
+ * @return The highest weighted term (useful for passing to GradientFormatter to set top end of
+ * coloring scale).
*/
public float getMaxTermWeight() {
return maxTermWeight;
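By contrast with QueryScorer, QueryTermScorer matches on term text alone (no position information), so the minimal sketch is just (query and reader as in the QueryTermExtractor sketch above):

    // Rewriting first is recommended by the constructor javadoc above.
    Highlighter highlighter = new Highlighter(new QueryTermScorer(query.rewrite(reader)));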
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
index fd8f484..26c4c32 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
@@ -17,50 +17,46 @@
package org.apache.lucene.search.highlight;
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenStream;
/**
- * A Scorer is responsible for scoring a stream of tokens. These token scores
- * can then be used to compute {@link TextFragment} scores.
+ * A Scorer is responsible for scoring a stream of tokens. These token scores can then be used to
+ * compute {@link TextFragment} scores.
*/
public interface Scorer {
/**
- * Called to init the Scorer with a {@link TokenStream}. You can grab references to
- * the attributes you are interested in here and access them from {@link #getTokenScore()}.
- *
+ * Called to init the Scorer with a {@link TokenStream}. You can grab references to the attributes
+ * you are interested in here and access them from {@link #getTokenScore()}.
+ *
* @param tokenStream the {@link TokenStream} that will be scored.
- * @return either a {@link TokenStream} that the Highlighter should continue using (eg
- * if you read the tokenSream in this method) or null to continue
- * using the same {@link TokenStream} that was passed in.
+ * @return either a {@link TokenStream} that the Highlighter should continue using (e.g. if you
+ * read the tokenStream in this method) or null to continue using the same {@link TokenStream}
+ * that was passed in.
* @throws IOException If there is a low-level I/O error
*/
public TokenStream init(TokenStream tokenStream) throws IOException;
/**
* Called when a new fragment is started for consideration.
- *
+ *
* @param newFragment the fragment that will be scored next
*/
public void startFragment(TextFragment newFragment);
/**
- * Called for each token in the current fragment. The {@link Highlighter} will
- * increment the {@link TokenStream} passed to init on every call.
- *
- * @return a score which is passed to the {@link Highlighter} class to influence the
- * mark-up of the text (this return value is NOT used to score the
- * fragment)
+ * Called for each token in the current fragment. The {@link Highlighter} will increment the
+ * {@link TokenStream} passed to init on every call.
+ *
+ * @return a score which is passed to the {@link Highlighter} class to influence the mark-up of
+ * the text (this return value is NOT used to score the fragment)
*/
public float getTokenScore();
/**
- * Called when the {@link Highlighter} has no more tokens for the current fragment -
- * the Scorer returns the weighting it has derived for the most recent
- * fragment, typically based on the results of {@link #getTokenScore()}.
- *
+ * Called when the {@link Highlighter} has no more tokens for the current fragment - the Scorer
+ * returns the weighting it has derived for the most recent fragment, typically based on the
+ * results of {@link #getTokenScore()}.
*/
public float getFragmentScore();
-
}
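A minimal toy implementation of the contract above, to show the call order: every token is marked up and a fragment's score is simply its token count. This is a sketch, not part of the commit:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.search.highlight.Scorer;
    import org.apache.lucene.search.highlight.TextFragment;

    class TokenCountScorer implements Scorer {
      private float count;

      @Override
      public TokenStream init(TokenStream tokenStream) throws IOException {
        return null; // keep using the stream the Highlighter passed in (see javadoc above)
      }

      @Override
      public void startFragment(TextFragment newFragment) {
        count = 0; // stats are accumulated per fragment
      }

      @Override
      public float getTokenScore() {
        count++;
        return 1f; // non-zero, so the Formatter will mark the token up
      }

      @Override
      public float getFragmentScore() {
        return count; // this value ranks the fragment
      }
    }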
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
index 4f25c3a..c504918 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
@@ -20,8 +20,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
- * {@link Fragmenter} implementation which breaks text up into same-size
- * fragments with no concerns over spotting sentence boundaries.
+ * {@link Fragmenter} implementation which breaks text up into same-size fragments with no concerns
+ * over spotting sentence boundaries.
*/
public class SimpleFragmenter implements Fragmenter {
private static final int DEFAULT_FRAGMENT_SIZE = 100;
@@ -33,15 +33,11 @@ public class SimpleFragmenter implements Fragmenter {
this(DEFAULT_FRAGMENT_SIZE);
}
- /**
- *
- * @param fragmentSize size in number of characters of each fragment
- */
+ /** @param fragmentSize size in number of characters of each fragment */
public SimpleFragmenter(int fragmentSize) {
this.fragmentSize = fragmentSize;
}
-
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
*/
@@ -51,7 +47,6 @@ public class SimpleFragmenter implements Fragmenter {
currentNumFrags = 1;
}
-
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
*/
@@ -64,18 +59,13 @@ public class SimpleFragmenter implements Fragmenter {
return isNewFrag;
}
- /**
- * @return size in number of characters of each fragment
- */
+ /** @return size in number of characters of each fragment */
public int getFragmentSize() {
return fragmentSize;
}
- /**
- * @param size size in characters of each fragment
- */
+ /** @param size size in characters of each fragment */
public void setFragmentSize(int size) {
fragmentSize = size;
}
-
}
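A one-line sketch (highlighter as in the earlier examples):

    // Roughly 50-character fragments; boundaries fall wherever tokens end.
    highlighter.setTextFragmenter(new SimpleFragmenter(50));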
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
index 7e7630d..393d895 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
@@ -16,60 +16,50 @@
*/
package org.apache.lucene.search.highlight;
-/**
- * Simple {@link Encoder} implementation to escape text for HTML output
- *
- */
+/** Simple {@link Encoder} implementation to escape text for HTML output */
public class SimpleHTMLEncoder implements Encoder {
- public SimpleHTMLEncoder() {
- }
+ public SimpleHTMLEncoder() {}
@Override
- public String encodeText(String originalText)
- {
+ public String encodeText(String originalText) {
return htmlEncode(originalText);
}
- /**
- * Encode string into HTML
- */
- public final static String htmlEncode(String plainText)
- {
- if (plainText == null || plainText.length() == 0)
- {
+ /** Encode string into HTML */
+ public static final String htmlEncode(String plainText) {
+ if (plainText == null || plainText.length() == 0) {
return "";
}
StringBuilder result = new StringBuilder(plainText.length());
- for (int index=0; index<plainText.length(); index++)
- {
+ for (int index = 0; index < plainText.length(); index++) {
char ch = plainText.charAt(index);
switch (ch) {
- case '"':
- result.append(""");
- break;
- case '&':
- result.append("&");
- break;
- case '<':
- result.append("<");
- break;
- case '>':
- result.append(">");
- break;
- case '\'':
- result.append("'");
- break;
- case '/':
- result.append("/");
- break;
- default:
- result.append(ch);
+ case '"':
+ result.append(""");
+ break;
+ case '&':
+ result.append("&");
+ break;
+ case '<':
+ result.append("<");
+ break;
+ case '>':
+ result.append(">");
+ break;
+ case '\'':
+ result.append("'");
+ break;
+ case '/':
+ result.append("/");
+ break;
+ default:
+ result.append(ch);
}
}
return result.toString();
}
-}
\ No newline at end of file
+}
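For reference, a usage sketch of the encoder above (illustrative only, not part of this diff; the input string is arbitrary):

    import org.apache.lucene.search.highlight.SimpleHTMLEncoder;

    String safe = SimpleHTMLEncoder.htmlEncode("<b>R&D</b>");
    // safe == "&lt;b&gt;R&amp;D&lt;&#x2F;b&gt;"

An Encoder instance can also be handed to the Highlighter constructor that takes a Formatter, an Encoder, and a Scorer, so that non-highlighted text is escaped while the formatter's own tags pass through.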
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java
index fec3971..b129890 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java
@@ -16,15 +16,12 @@
*/
package org.apache.lucene.search.highlight;
-/**
- * Simple {@link Formatter} implementation to highlight terms with a pre and
- * post tag.
- */
+/** Simple {@link Formatter} implementation to highlight terms with a pre and post tag. */
public class SimpleHTMLFormatter implements Formatter {
-
+
private static final String DEFAULT_PRE_TAG = "<B>";
private static final String DEFAULT_POST_TAG = "</B>";
-
+
private String preTag;
private String postTag;
@@ -49,11 +46,11 @@ public class SimpleHTMLFormatter implements Formatter {
// Allocate StringBuilder with the right number of characters from the
// beginning, to avoid char[] allocations in the middle of appends.
- StringBuilder returnBuffer = new StringBuilder(preTag.length() + originalText.length() + postTag.length());
+ StringBuilder returnBuffer =
+ new StringBuilder(preTag.length() + originalText.length() + postTag.length());
returnBuffer.append(preTag);
returnBuffer.append(originalText);
returnBuffer.append(postTag);
return returnBuffer.toString();
}
-
}
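For reference, a sketch of using SimpleHTMLFormatter with custom tags (illustrative only, not part of this diff; `query` is assumed to exist and the tag strings are arbitrary):

    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

    // Assumed input: Query query. The no-arg constructor defaults to <B>...</B>.
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em class=\"hit\">", "</em>");
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));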
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
index 7b708d8..735f39c 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
@@ -16,19 +16,16 @@
*/
package org.apache.lucene.search.highlight;
-
import java.util.List;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.spans.Spans;
-
/**
- * {@link Fragmenter} implementation which breaks text up into same-size
- * fragments but does not split up {@link Spans}. This is a simple sample class.
+ * {@link Fragmenter} implementation which breaks text up into same-size fragments but does not
+ * split up {@link Spans}. This is a simple sample class.
*/
public class SimpleSpanFragmenter implements Fragmenter {
private static final int DEFAULT_FRAGMENT_SIZE = 100;
@@ -42,9 +39,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
private PositionIncrementAttribute posIncAtt;
private OffsetAttribute offsetAtt;
- /**
- * @param queryScorer QueryScorer that was used to score hits
- */
+ /** @param queryScorer QueryScorer that was used to score hits */
public SimpleSpanFragmenter(QueryScorer queryScorer) {
this(queryScorer, DEFAULT_FRAGMENT_SIZE);
}
@@ -57,7 +52,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
this.fragmentSize = fragmentSize;
this.queryScorer = queryScorer;
}
-
+
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
*/
@@ -84,9 +79,10 @@ public class SimpleSpanFragmenter implements Fragmenter {
}
}
- boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags)
- && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1);
-
+ boolean isNewFrag =
+ offsetAtt.endOffset() >= (fragmentSize * currentNumFrags)
+ && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1);
+
if (isNewFrag) {
currentNumFrags++;
}
@@ -94,7 +90,6 @@ public class SimpleSpanFragmenter implements Fragmenter {
return isNewFrag;
}
-
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
*/
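For reference, a sketch of wiring SimpleSpanFragmenter to the same QueryScorer used for scoring, which is what lets it avoid splitting spans (illustrative only, not part of this diff; `query` and the "body" field are assumed):

    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

    // Assumed input: Query query; "body" is a hypothetical field name.
    QueryScorer scorer = new QueryScorer(query, "body");
    Highlighter highlighter = new Highlighter(scorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 120)); // ~120 chars, spans kept whole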
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java
index e108814..0871f5e 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java
@@ -16,28 +16,25 @@
*/
package org.apache.lucene.search.highlight;
/**
- * Formats text with different color intensity depending on the score of the
- * term using the span tag. GradientFormatter uses a bgcolor argument to the font tag which
- * doesn't work in Mozilla, thus this class.
+ * Formats text with different color intensity depending on the score of the term using the span
+ * tag. GradientFormatter uses a bgcolor argument to the font tag which doesn't work in Mozilla,
+ * thus this class.
*
* @see GradientFormatter
*/
-
-public class SpanGradientFormatter
- extends GradientFormatter {
- public SpanGradientFormatter(float maxScore, String minForegroundColor,
- String maxForegroundColor, String minBackgroundColor,
- String maxBackgroundColor) {
- super(maxScore, minForegroundColor,
- maxForegroundColor, minBackgroundColor,
- maxBackgroundColor);
+public class SpanGradientFormatter extends GradientFormatter {
+ public SpanGradientFormatter(
+ float maxScore,
+ String minForegroundColor,
+ String maxForegroundColor,
+ String minBackgroundColor,
+ String maxBackgroundColor) {
+ super(maxScore, minForegroundColor, maxForegroundColor, minBackgroundColor, maxBackgroundColor);
}
-
@Override
public String highlightTerm(String originalText, TokenGroup tokenGroup) {
- if (tokenGroup.getTotalScore() == 0)
- return originalText;
+ if (tokenGroup.getTotalScore() == 0) return originalText;
float score = tokenGroup.getTotalScore();
if (score == 0) {
return originalText;
@@ -63,7 +60,9 @@ public class SpanGradientFormatter
return sb.toString();
}
- // guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuilder resize
- private static final String TEMPLATE = "<span style=\"background: #EEEEEE; color: #000000;\">...</span>";
+ // guess how much extra text we'll add to the text we're highlighting to try to avoid a
+ // StringBuilder resize
+ private static final String TEMPLATE =
+ "<span style=\"background: #EEEEEE; color: #000000;\">...</span>";
private static final int EXTRA = TEMPLATE.length();
}
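For reference, a construction sketch (illustrative only, not part of this diff; the maxScore of 1.0f stands in for the top hit's score, which callers normally supply):

    import org.apache.lucene.search.highlight.SpanGradientFormatter;

    // Foreground fades from black to red as the term score rises toward maxScore.
    // Passing null for the background pair leaves background color untouched
    // (GradientFormatter only activates a channel when both min and max are non-null).
    SpanGradientFormatter formatter =
        new SpanGradientFormatter(1.0f, "#000000", "#FF0000", null, null);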
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
index 0b8cb90..0b782cf 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
@@ -19,7 +19,6 @@ package org.apache.lucene.search.highlight;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
-
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
@@ -50,25 +49,26 @@ public class TermVectorLeafReader extends LeafReader {
private final FieldInfos fieldInfos;
public TermVectorLeafReader(String field, Terms terms) {
- fields = new Fields() {
- @Override
- public Iterator<String> iterator() {
- return Collections.singletonList(field).iterator();
- }
-
- @Override
- public Terms terms(String fld) throws IOException {
- if (!field.equals(fld)) {
- return null;
- }
- return terms;
- }
-
- @Override
- public int size() {
- return 1;
- }
- };
+ fields =
+ new Fields() {
+ @Override
+ public Iterator<String> iterator() {
+ return Collections.singletonList(field).iterator();
+ }
+
+ @Override
+ public Terms terms(String fld) throws IOException {
+ if (!field.equals(fld)) {
+ return null;
+ }
+ return terms;
+ }
+
+ @Override
+ public int size() {
+ return 1;
+ }
+ };
IndexOptions indexOptions;
if (!terms.hasFreqs()) {
@@ -80,15 +80,28 @@ public class TermVectorLeafReader extends LeafReader {
} else {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
}
- FieldInfo fieldInfo = new FieldInfo(field, 0,
- true, true, terms.hasPayloads(),
- indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, 0, VectorValues.SearchStrategy.NONE, false);
- fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
- }
-
- @Override
- protected void doClose() throws IOException {
- }
+ FieldInfo fieldInfo =
+ new FieldInfo(
+ field,
+ 0,
+ true,
+ true,
+ terms.hasPayloads(),
+ indexOptions,
+ DocValuesType.NONE,
+ -1,
+ Collections.emptyMap(),
+ 0,
+ 0,
+ 0,
+ 0,
+ VectorValues.SearchStrategy.NONE,
+ false);
+ fieldInfos = new FieldInfos(new FieldInfo[] {fieldInfo});
+ }
+
+ @Override
+ protected void doClose() throws IOException {}
@Override
public Terms terms(String field) throws IOException {
@@ -122,7 +135,7 @@ public class TermVectorLeafReader extends LeafReader {
@Override
public NumericDocValues getNormValues(String field) throws IOException {
- return null;//Is this needed? See MemoryIndex for a way to do it.
+ return null; // Is this needed? See MemoryIndex for a way to do it.
}
@Override
@@ -146,8 +159,7 @@ public class TermVectorLeafReader extends LeafReader {
}
@Override
- public void checkIntegrity() throws IOException {
- }
+ public void checkIntegrity() throws IOException {}
@Override
public Fields getTermVectors(int docID) throws IOException {
@@ -168,8 +180,7 @@ public class TermVectorLeafReader extends LeafReader {
}
@Override
- public void document(int docID, StoredFieldVisitor visitor) throws IOException {
- }
+ public void document(int docID, StoredFieldVisitor visitor) throws IOException {}
@Override
public LeafMetaData getMetaData() {
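For reference, a sketch of what this class is for: exposing a single document's term vector as a one-field, one-document LeafReader (illustrative only, not part of this diff; `reader`, `docId`, and the "body" field are assumed, and "body" must have been indexed with term vectors):

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.highlight.TermVectorLeafReader;

    // Assumed inputs: IndexReader reader, int docId.
    Terms tv = reader.getTermVectors(docId).terms("body"); // null if no vectors were stored
    LeafReader single = new TermVectorLeafReader("body", tv);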
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
index d71954f..3ae4e9f 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
@@ -15,56 +15,39 @@
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
-/**
- * Low-level class used to record information about a section of a document
- * with a score.
- *
- *
- */
-public class TextFragment
-{
+/** Low-level class used to record information about a section of a document with a score. */
+public class TextFragment {
CharSequence markedUpText;
int fragNum;
int textStartPos;
int textEndPos;
float score;
- public TextFragment(CharSequence markedUpText,int textStartPos, int fragNum)
- {
- this.markedUpText=markedUpText;
+ public TextFragment(CharSequence markedUpText, int textStartPos, int fragNum) {
+ this.markedUpText = markedUpText;
this.textStartPos = textStartPos;
this.fragNum = fragNum;
}
- void setScore(float score)
- {
- this.score=score;
+ void setScore(float score) {
+ this.score = score;
}
- public float getScore()
- {
+
+ public float getScore() {
return score;
}
- /**
- * @param frag2 Fragment to be merged into this one
- */
- public void merge(TextFragment frag2)
- {
+ /** @param frag2 Fragment to be merged into this one */
+ public void merge(TextFragment frag2) {
textEndPos = frag2.textEndPos;
- score=Math.max(score,frag2.score);
+ score = Math.max(score, frag2.score);
}
- /**
- * @return true if this fragment follows the one passed
- */
- public boolean follows(TextFragment fragment)
- {
+ /** @return true if this fragment follows the one passed */
+ public boolean follows(TextFragment fragment) {
return textStartPos == fragment.textEndPos;
}
- /**
- * @return the fragment sequence number
- */
- public int getFragNum()
- {
+ /** @return the fragment sequence number */
+ public int getFragNum() {
return fragNum;
}
@@ -74,5 +57,4 @@ public class TextFragment
public String toString() {
return markedUpText.subSequence(textStartPos, textEndPos).toString();
}
-
}
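For reference, TextFragments are normally obtained from the Highlighter rather than constructed directly; a sketch (illustrative only, not part of this diff; `highlighter`, `tokenStream`, and `text` are assumed):

    import org.apache.lucene.search.highlight.TextFragment;

    // Assumed inputs: Highlighter highlighter, TokenStream tokenStream, String text.
    TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, true, 3);
    for (TextFragment frag : frags) {
      if (frag != null && frag.getScore() > 0) {
        System.out.println(frag.getFragNum() + ": " + frag); // toString() is the marked-up slice
      }
    }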
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
index ebb37d7..eb903fc 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
- * One, or several overlapping tokens, along with the score(s) and the scope of
- * the original text.
+ * One, or several overlapping tokens, along with the score(s) and the scope of the original text.
*/
public class TokenGroup {
@@ -82,7 +81,6 @@ public class TokenGroup {
}
/**
- *
* @param index a value between 0 and numTokens -1
* @return the "n"th score
*/
@@ -91,33 +89,28 @@ public class TokenGroup {
}
/**
- * @return the earliest start offset in the original text of a matching token in this group (score > 0), or
- * if there are none then the earliest offset of any token in the group.
+ * @return the earliest start offset in the original text of a matching token in this group (score
+ * > 0), or if there are none then the earliest offset of any token in the group.
*/
public int getStartOffset() {
return matchStartOffset;
}
/**
- * @return the latest end offset in the original text of a matching token in this group (score > 0), or
- * if there are none then {@link #getEndOffset()}.
+ * @return the latest end offset in the original text of a matching token in this group (score
+ * > 0), or if there are none then {@link #getEndOffset()}.
*/
public int getEndOffset() {
return matchEndOffset;
}
- /**
- * @return the number of tokens in this group
- */
+ /** @return the number of tokens in this group */
public int getNumTokens() {
return numTokens;
}
- /**
- * @return all tokens' scores summed up
- */
+ /** @return all tokens' scores summed up */
public float getTotalScore() {
return tot;
}
-
}
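For reference, a TokenGroup is what a Formatter receives per run of (possibly overlapping) tokens; since Formatter declares a single method, a lambda works as a sketch (illustrative only, not part of this diff):

    import org.apache.lucene.search.highlight.Formatter;

    // Bold only groups that actually matched the query (total score > 0).
    Formatter formatter =
        (originalText, tokenGroup) ->
            tokenGroup.getTotalScore() > 0 ? "<b>" + originalText + "</b>" : originalText;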
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
index 0c0a63f..a3d8dbb 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
@@ -1,7 +1,4 @@
/*
- * Created on 28-Oct-2004
- */
-/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -20,7 +17,6 @@
package org.apache.lucene.search.highlight;
import java.io.IOException;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
@@ -29,8 +25,9 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
/**
- * Convenience methods for obtaining a {@link TokenStream} for use with the {@link Highlighter} - can obtain from
- * term vectors with offsets and positions or from an Analyzer re-parsing the stored content.
+ * Convenience methods for obtaining a {@link TokenStream} for use with the {@link Highlighter} -
+ * can obtain from term vectors with offsets and positions or from an Analyzer re-parsing the stored
+ * content.
*
* @see TokenStreamFromTermVector
*/
@@ -39,23 +36,26 @@ public class TokenSources {
private TokenSources() {}
/**
- * Get a token stream from either un-inverting a term vector if possible, or by analyzing the text.
+ * Get a token stream from either un-inverting a term vector if possible, or by analyzing the
+ * text.
*
- * WARNING: Don't call this if there is more than one value for this field. If there are, and if there are term
- * vectors, then there is a single tokenstream with offsets suggesting all the field values were concatenated.
+ * <p>WARNING: Don't call this if there is more than one value for this field. If there are, and
+ * if there are term vectors, then there is a single tokenstream with offsets suggesting all the
+ * field values were concatenated.
*
* @param field The field to either get term vectors from or to analyze the text from.
- * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
- * be re-used for the same document (e.g. when highlighting multiple fields).
+ * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance,
+ * this instance should be re-used for the same document (e.g. when highlighting multiple
+ * fields).
* @param text the text to analyze, failing term vector un-inversion
* @param analyzer the analyzer to analyze {@code text} with, failing term vector un-inversion
- * @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no limit.
- * Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1.
- *
+ * @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no
+ * limit. Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1.
* @return a token stream from either term vectors, or from analyzing the text. Never null.
*/
- public static TokenStream getTokenStream(String field, Fields tvFields, String text, Analyzer analyzer,
- int maxStartOffset) throws IOException {
+ public static TokenStream getTokenStream(
+ String field, Fields tvFields, String text, Analyzer analyzer, int maxStartOffset)
+ throws IOException {
TokenStream tokenStream = getTermVectorTokenStreamOrNull(field, tvFields, maxStartOffset);
if (tokenStream != null) {
return tokenStream;
@@ -68,19 +68,20 @@ public class TokenSources {
}
/**
- * Get a token stream by un-inverting the term vector. This method returns null if {@code tvFields} is null
- * or if the field has no term vector, or if the term vector doesn't have offsets. Positions are recommended on the
- * term vector but it isn't strictly required.
+ * Get a token stream by un-inverting the term vector. This method returns null if {@code
+ * tvFields} is null or if the field has no term vector, or if the term vector doesn't have
+ * offsets. Positions are recommended on the term vector but it isn't strictly required.
*
* @param field The field to get term vectors from.
- * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
- * be re-used for the same document (e.g. when highlighting multiple fields).
- * @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no limit.
- * Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1
+ * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance,
+ * this instance should be re-used for the same document (e.g. when highlighting multiple
+ * fields).
+ * @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no
+ * limit. Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1
* @return a token stream from term vectors. Null if no term vectors with the right options.
*/
- public static TokenStream getTermVectorTokenStreamOrNull(String field, Fields tvFields, int maxStartOffset)
- throws IOException {
+ public static TokenStream getTermVectorTokenStreamOrNull(
+ String field, Fields tvFields, int maxStartOffset) throws IOException {
if (tvFields == null) {
return null;
}
@@ -93,26 +94,24 @@ public class TokenSources {
/**
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
- * specified docId, then, falls back to using the passed in
- * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
- * This is useful when you already have the document, but would prefer to use
- * the vector first.
+ * specified docId, then, falls back to using the passed in {@link
+ * org.apache.lucene.document.Document} to retrieve the TokenStream. This is useful when you
+ * already have the document, but would prefer to use the vector first.
*
- * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
- * and get the vector from
+ * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try and get the vector
+ * from
* @param docId The docId to retrieve.
* @param field The field to retrieve on the document
* @param document The document to fall back on
- * @param analyzer The analyzer to use for creating the TokenStream if the
- * vector doesn't exist
- * @return The {@link org.apache.lucene.analysis.TokenStream} for the
- * {@link org.apache.lucene.index.IndexableField} on the
- * {@link org.apache.lucene.document.Document}
+ * @param analyzer The analyzer to use for creating the TokenStream if the vector doesn't exist
+ * @return The {@link org.apache.lucene.analysis.TokenStream} for the {@link
+ * org.apache.lucene.index.IndexableField} on the {@link org.apache.lucene.document.Document}
* @throws IOException if there was an error loading
*/
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, Document document, Analyzer analyzer) throws IOException {
+ public static TokenStream getAnyTokenStream(
+ IndexReader reader, int docId, String field, Document document, Analyzer analyzer)
+ throws IOException {
TokenStream ts = null;
Fields vectors = reader.getTermVectors(docId);
@@ -131,17 +130,16 @@ public class TokenSources {
}
/**
- * A convenience method that tries a number of approaches to getting a token
- * stream. The cost of finding there are no termVectors in the index is
- * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
- * approach to coding is probably acceptable
- *
+ * A convenience method that tries a number of approaches to getting a token stream. The cost of
+ * finding there are no termVectors in the index is minimal (1000 invocations still registers 0
+ * ms). So this "lazy" (flexible?) approach to coding is probably acceptable
+ *
* @return null if field not stored correctly
* @throws IOException If there is a low-level I/O error
*/
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, Analyzer analyzer) throws IOException {
+ public static TokenStream getAnyTokenStream(
+ IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
TokenStream ts = null;
Fields vectors = reader.getTermVectors(docId);
@@ -161,18 +159,18 @@ public class TokenSources {
/** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getTokenStream(Terms vector,
- boolean tokenPositionsGuaranteedContiguous) throws IOException {
+ public static TokenStream getTokenStream(Terms vector, boolean tokenPositionsGuaranteedContiguous)
+ throws IOException {
return getTokenStream(vector);
}
/**
- * Returns a token stream generated from a {@link Terms}. This
- * can be used to feed the highlighter with a pre-parsed token
- * stream. The {@link Terms} must have offsets available. If there are no positions available,
- * all tokens will have position increments reflecting adjacent tokens, or coincident when terms
- * share a start offset. If there are stopwords filtered from the index, you probably want to ensure
- * term vectors have positions so that phrase queries won't match across stopwords.
+ * Returns a token stream generated from a {@link Terms}. This can be used to feed the highlighter
+ * with a pre-parsed token stream. The {@link Terms} must have offsets available. If there are no
+ * positions available, all tokens will have position increments reflecting adjacent tokens, or
+ * coincident when terms share a start offset. If there are stopwords filtered from the index, you
+ * probably want to ensure term vectors have positions so that phrase queries won't match across
+ * stopwords.
*
* @throws IllegalArgumentException if no offsets are available
*/
@@ -181,7 +179,7 @@ public class TokenSources {
if (!tpv.hasOffsets()) {
throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
- //TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
+ // TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
// highlighters require offsets, so we insist here.
}
@@ -189,22 +187,21 @@ public class TokenSources {
}
/**
- * Returns a {@link TokenStream} with positions and offsets constructed from
- * field termvectors. If the field has no termvectors or offsets
- * are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)}
- * for an explanation of what happens when positions aren't present.
+ * Returns a {@link TokenStream} with positions and offsets constructed from field termvectors. If
+ * the field has no termvectors or offsets are not included in the termvector, return null. See
+ * {@link #getTokenStream(org.apache.lucene.index.Terms)} for an explanation of what happens when
+ * positions aren't present.
*
* @param reader the {@link IndexReader} to retrieve term vectors from
* @param docId the document to retrieve termvectors for
* @param field the field to retrieve termvectors for
* @return a {@link TokenStream}, or null if offsets are not available
* @throws IOException If there is a low-level I/O error
- *
* @see #getTokenStream(org.apache.lucene.index.Terms)
*/
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
- String field) throws IOException {
+ public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, String field)
+ throws IOException {
Fields vectors = reader.getTermVectors(docId);
if (vectors == null) {
@@ -219,32 +216,29 @@ public class TokenSources {
if (!vector.hasOffsets()) {
return null;
}
-
+
return getTokenStream(vector);
}
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getTokenStream(IndexReader reader, int docId,
- String field, Analyzer analyzer) throws IOException {
+ public static TokenStream getTokenStream(
+ IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
Document doc = reader.document(docId);
return getTokenStream(doc, field, analyzer);
}
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getTokenStream(Document doc, String field,
- Analyzer analyzer) {
+ public static TokenStream getTokenStream(Document doc, String field, Analyzer analyzer) {
String contents = doc.get(field);
if (contents == null) {
- throw new IllegalArgumentException("Field " + field
- + " in document is not stored and cannot be analyzed");
+ throw new IllegalArgumentException(
+ "Field " + field + " in document is not stored and cannot be analyzed");
}
return getTokenStream(field, contents, analyzer);
}
@Deprecated // maintenance reasons LUCENE-6445
- public static TokenStream getTokenStream(String field, String contents,
- Analyzer analyzer) {
+ public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer) {
return analyzer.tokenStream(field, contents);
}
-
}
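For reference, a call sketch for the non-deprecated entry point above (illustrative only, not part of this diff; `reader`, `docId`, `analyzer`, `storedText`, and `maxDocCharsToAnalyze` are assumed, and "body" is a hypothetical field):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.search.highlight.TokenSources;

    // Re-use tvFields across fields of the same document, per the javadoc above.
    Fields tvFields = reader.getTermVectors(docId); // may be null
    TokenStream ts =
        TokenSources.getTokenStream(
            "body", tvFields, storedText, analyzer, maxDocCharsToAnalyze - 1);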
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
index 6353930..7d4932c 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
@@ -17,7 +17,6 @@
package org.apache.lucene.search.highlight;
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -34,17 +33,19 @@ import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
/**
- * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
- * want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
- * because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
- * for them and if not then won't get them. This TokenStream supports an efficient {@link #reset()}, so there's
- * no need to wrap with a caching impl.
- * <p>
- * The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
- * in positions, this is fine. And it assumes there aren't large numbers of tokens at the same position, since it adds
- * them to a linked-list per position in O(N^2) complexity. When there aren't positions in the term vector, it divides
- * the startOffset by 8 to use as a temporary substitute. In that case, tokens with the same startOffset will occupy
- * the same final position; otherwise tokens become adjacent.
+ * TokenStream created from a term vector field. The term vector requires positions and/or offsets
+ * (either). If you want payloads add PayloadAttributeImpl (as you would normally) but don't assume
+ * the attribute is already added just because you know the term vector has payloads, since the
+ * first call to incrementToken() will observe if you asked for them and if not then won't get them.
+ * This TokenStream supports an efficient {@link #reset()}, so there's no need to wrap with a
+ * caching impl.
+ *
+ * <p>The implementation will create an array of tokens indexed by token position. As long as there
+ * aren't massive jumps in positions, this is fine. And it assumes there aren't large numbers of
+ * tokens at the same position, since it adds them to a linked-list per position in O(N^2)
+ * complexity. When there aren't positions in the term vector, it divides the startOffset by 8 to
+ * use as a temporary substitute. In that case, tokens with the same startOffset will occupy the
+ * same final position; otherwise tokens become adjacent.
*
* @lucene.internal
*/
@@ -58,28 +59,29 @@ public final class TokenStreamFromTermVector extends TokenStream {
private final int maxStartOffset;
- private OffsetAttribute offsetAttribute;//maybe null
+ private OffsetAttribute offsetAttribute; // maybe null
- private PayloadAttribute payloadAttribute;//maybe null
+ private PayloadAttribute payloadAttribute; // maybe null
- private CharsRefBuilder termCharsBuilder;//term data here
+ private CharsRefBuilder termCharsBuilder; // term data here
- private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
- private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
+ private BytesRefArray payloadsBytesRefArray; // only used when payloadAttribute is non-null
+ private BytesRefBuilder spareBytesRefBuilder; // only used when payloadAttribute is non-null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
- private boolean initialized = false;//lazy
+ private boolean initialized = false; // lazy
/**
- * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
- * {@link #incrementToken}.
+ * Constructor. The uninversion doesn't happen here; it's delayed till the first call to {@link
+ * #incrementToken}.
*
- * @param vector Terms that contains the data for
- * creating the TokenStream. Must have positions and/or offsets.
- * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
+ * @param vector Terms that contains the data for creating the TokenStream. Must have positions
+ * and/or offsets.
+ * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1
+ * disables the limit.
*/
public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
@@ -93,7 +95,9 @@ public final class TokenStreamFromTermVector extends TokenStream {
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
}
- public Terms getTermVectorTerms() { return vector; }
+ public Terms getTermVectorTerms() {
+ return vector;
+ }
@Override
public void reset() throws IOException {
@@ -101,7 +105,8 @@ public final class TokenStreamFromTermVector extends TokenStream {
super.reset();
}
- //We delay initialization because we can see which attributes the consumer wants, particularly payloads
+ // We delay initialization because we can see which attributes the consumer wants, particularly
+ // payloads
private void init() throws IOException {
assert !initialized;
short dpEnumFlags = PostingsEnum.POSITIONS;
@@ -110,7 +115,7 @@ public final class TokenStreamFromTermVector extends TokenStream {
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
- dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
+ dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS); // must ask for offsets too
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
@@ -118,7 +123,7 @@ public final class TokenStreamFromTermVector extends TokenStream {
// We put term data here
termCharsBuilder = new CharsRefBuilder();
- termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+ termCharsBuilder.grow((int) (vector.size() * 7)); // 7 is over-estimate of average term len
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
@@ -129,11 +134,12 @@ public final class TokenStreamFromTermVector extends TokenStream {
final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
- CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
- //int sumFreq = 0;
+ CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder(); // only for UTF8->UTF16 call
+ // int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
- //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
- // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
+ // Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
+ // note: if term vectors supported seek by ord then we might just keep an int and seek by ord
+ // on-demand
tempCharsRefBuilder.grow(termBytesRef.length);
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
final int termCharsOff = termCharsBuilder.length();
@@ -143,7 +149,7 @@ public final class TokenStreamFromTermVector extends TokenStream {
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
- //sumFreq += freq;
+ // sumFreq += freq;
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
@@ -152,11 +158,12 @@ public final class TokenStreamFromTermVector extends TokenStream {
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
if (token.startOffset > maxStartOffset) {
- continue;//filter this token out; exceeds threshold
+ continue; // filter this token out; exceeds threshold
}
- token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
+ token.endOffsetInc =
+ (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
- pos = token.startOffset >> 3;//divide by 8
+ pos = token.startOffset >> 3; // divide by 8
}
}
@@ -165,10 +172,10 @@ public final class TokenStreamFromTermVector extends TokenStream {
token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
}
- //Add token to an array indexed by position
+ // Add token to an array indexed by position
if (positionedTokens.length <= pos) {
- //grow, but not 2x since we think our original length estimate is close
- TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)];
+ // grow, but not 2x since we think our original length estimate is close
+ TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
positionedTokens = newPositionedTokens;
}
@@ -178,10 +185,12 @@ public final class TokenStreamFromTermVector extends TokenStream {
}
}
-// System.out.println(String.format(
-// "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
-// sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
-// (originalPositionEstimate/(lastPosition + 1.0f))));
+ // System.out.println(String.format(
+ // "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f
+ // WastePct: %3.3f",
+ // sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition,
+ // ((float)lastPosition)/sumFreq,
+ // (originalPositionEstimate/(lastPosition + 1.0f))));
// Step 2: Link all Tokens into a linked-list and set position increments as we go
@@ -192,15 +201,15 @@ public final class TokenStreamFromTermVector extends TokenStream {
if (token == null) {
continue;
}
- //link
+ // link
if (prevToken != null) {
assert prevToken.next == null;
- prevToken.next = token; //concatenate linked-list
+ prevToken.next = token; // concatenate linked-list
} else {
assert firstToken == null;
firstToken = token;
}
- //set increments
+ // set increments
if (vector.hasPositions()) {
token.positionIncrement = pos - prevTokenPos;
while (token.next != null) {
@@ -227,18 +236,21 @@ public final class TokenStreamFromTermVector extends TokenStream {
}
private TokenLL[] initTokensArray() throws IOException {
- // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+ // Estimate the number of position slots we need from term stats. We use some estimation
+ // factors taken from
// Wikipedia that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
assert sumTotalTermFreq != -1;
- final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
+ final int originalPositionEstimate =
+ (int) (sumTotalTermFreq * 1.5); // less than 1 in 10 docs exceed this
// This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
// Take the smaller of the two estimates, but no smaller than 64
- return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
+ return new TokenLL
+ [Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
}
@Override
@@ -258,16 +270,19 @@ public final class TokenStreamFromTermVector extends TokenStream {
return false;
}
clearAttributes();
- termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
+ termAttribute.copyBuffer(
+ termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
+ offsetAttribute.setOffset(
+ incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null) {
if (incrementToken.payloadIndex == -1) {
payloadAttribute.setPayload(null);
} else {
- payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
+ payloadAttribute.setPayload(
+ payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
}
}
return true;
@@ -286,9 +301,10 @@ public final class TokenStreamFromTermVector extends TokenStream {
TokenLL next;
- /** Given the head of a linked-list (possibly null) this inserts the token at the correct
- * spot to maintain the desired order, and returns the head (which could be this token if it's the smallest).
- * O(N^2) complexity but N should be a handful at most.
+ /**
+ * Given the head of a linked-list (possibly null) this inserts the token at the correct spot to
+ * maintain the desired order, and returns the head (which could be this token if it's the
+ * smallest). O(N^2) complexity but N should be a handful at most.
*/
TokenLL insertIntoSortedLinkedList(final TokenLL head) {
assert next == null;
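For reference, a direct-construction sketch using the constructor shown in this file (illustrative only, not part of this diff; `reader`, `docId`, and the "body" field are assumed, and the vector must carry positions and/or offsets):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.highlight.TokenStreamFromTermVector;

    // Assumed inputs: IndexReader reader, int docId.
    Terms tv = reader.getTermVectors(docId).terms("body");
    TokenStream ts = new TokenStreamFromTermVector(tv, -1); // -1 disables the startOffset cap
    // Uninversion is deferred until the first incrementToken() call, per the javadoc.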
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
index ee9cf3a..ab27318 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
@@ -16,17 +16,12 @@
*/
package org.apache.lucene.search.highlight;
-
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
-
-/**
- * Lightweight class to hold term, weight, and positions used for scoring this
- * term.
- */
-public class WeightedSpanTerm extends WeightedTerm{
+/** Lightweight class to hold term, weight, and positions used for scoring this term. */
+public class WeightedSpanTerm extends WeightedTerm {
boolean positionSensitive;
private List<PositionSpan> positionSpans = new ArrayList<>();
@@ -43,8 +38,7 @@ public class WeightedSpanTerm extends WeightedTerm{
/**
* Checks to see if this term is valid at <code>position</code>.
*
- * @param position
- * to check against valid term positions
+ * @param position to check against valid term positions
* @return true iff this term is a hit at this position
*/
public boolean checkPosition(int position) {
@@ -80,7 +74,4 @@ public class WeightedSpanTerm extends WeightedTerm{
public List<PositionSpan> getPositionSpans() {
return positionSpans;
}
-
}
-
-
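For reference, WeightedSpanTerms are produced by the WeightedSpanTermExtractor in the next file; a consumption sketch (illustrative only, not part of this diff; `query` and `tokenStream` are assumed, and "body"/"lucene"/position 42 are hypothetical):

    import java.util.Map;
    import org.apache.lucene.search.highlight.WeightedSpanTerm;
    import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;

    // Assumed inputs: Query query, TokenStream tokenStream.
    Map<String, WeightedSpanTerm> terms =
        new WeightedSpanTermExtractor().getWeightedSpanTerms(query, 1f, tokenStream, "body");
    WeightedSpanTerm wst = terms.get("lucene");
    if (wst != null && wst.isPositionSensitive()) {
      boolean hit = wst.checkPosition(42); // true only at positions recorded for this term
    }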
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
index ee0a187..9f75d65 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@@ -24,7 +24,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
-
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.BinaryDocValues;
@@ -68,12 +67,13 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
/**
- * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether
- * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
+ * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether {@link
+ * Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
+ *
+ * <p>In order to support additional, by default unsupported queries, subclasses can override {@link
+ * #extract(Query, float, Map)} for extracting wrapped or delegate queries and {@link
+ * #extractUnknownQuery(Query, Map)} to process custom leaf queries:
*
- * In order to support additional, by default unsupported queries, subclasses can override
- * {@link #extract(Query, float, Map)} for extracting wrapped or delegate queries and
- * {@link #extractUnknownQuery(Query, Map)} to process custom leaf queries:
* <pre>
* <code>
* WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor() {
@@ -99,7 +99,7 @@ import org.apache.lucene.util.IOUtils;
public class WeightedSpanTermExtractor {
private String fieldName;
- private TokenStream tokenStream;//set subsequent to getWeightedSpanTerms* methods
+ private TokenStream tokenStream; // set subsequent to getWeightedSpanTerms* methods
private String defaultField;
private boolean expandMultiTermQuery;
private boolean cachedTokenStream;
@@ -117,15 +117,15 @@ public class WeightedSpanTermExtractor {
}
/**
- * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>Query</code>.
- *
- * @param query
- * Query to extract Terms from
- * @param terms
- * Map to place created WeightedSpanTerms in
+ * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied
+ * <code>Query</code>.
+ *
+ * @param query Query to extract Terms from
+ * @param terms Map to place created WeightedSpanTerms in
* @throws IOException If there is a low-level I/O error
*/
- protected void extract(Query query, float boost, Map<String,WeightedSpanTerm> terms) throws IOException {
+ protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms)
+ throws IOException {
if (query instanceof BoostQuery) {
BoostQuery boostQuery = (BoostQuery) query;
extract(boostQuery.getQuery(), boost * boostQuery.getBoost(), terms);
@@ -151,13 +151,15 @@ public class WeightedSpanTermExtractor {
int[] positions = phraseQuery.getPositions();
if (positions.length >= 2) {
// positions are in increasing order. max(0,...) is just a safeguard.
- positionGaps = Math.max(0, positions[positions.length - 1] - positions[0] - positions.length + 1);
+ positionGaps =
+ Math.max(0, positions[positions.length - 1] - positions[0] - positions.length + 1);
}
- //if original slop is 0 then require inOrder
+ // if original slop is 0 then require inOrder
boolean inorder = (phraseQuery.getSlop() == 0);
- SpanNearQuery sp = new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder);
+ SpanNearQuery sp =
+ new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder);
extractWeightedSpanTerms(terms, sp, boost);
}
} else if (query instanceof TermQuery || query instanceof SynonymQuery) {
@@ -170,7 +172,7 @@ public class WeightedSpanTermExtractor {
extract(q, boost, terms);
}
} else if (query instanceof CommonTermsQuery) {
- // specialized since rewriting would change the result query
+ // specialized since rewriting would change the result query
// this query is index sensitive.
extractWeightedTerms(terms, query, boost);
} else if (query instanceof DisjunctionMaxQuery) {
@@ -190,7 +192,7 @@ public class WeightedSpanTermExtractor {
}
}
- @SuppressWarnings({"unchecked","rawtypes"})
+ @SuppressWarnings({"unchecked", "rawtypes"})
final List<SpanQuery>[] disjunctLists = new List[maxPosition + 1];
int distinctPositions = 0;
@@ -211,8 +213,8 @@ public class WeightedSpanTermExtractor {
final SpanQuery[] clauses = new SpanQuery[distinctPositions];
for (List<SpanQuery> disjuncts : disjunctLists) {
if (disjuncts != null) {
- clauses[position++] = new SpanOrQuery(disjuncts
- .toArray(new SpanQuery[disjuncts.size()]));
+ clauses[position++] =
+ new SpanOrQuery(disjuncts.toArray(new SpanQuery[disjuncts.size()]));
} else {
++positionGaps;
}
@@ -229,14 +231,14 @@ public class WeightedSpanTermExtractor {
}
}
} else if (query instanceof MatchAllDocsQuery) {
- //nothing
+ // nothing
} else if (query instanceof FunctionScoreQuery) {
extract(((FunctionScoreQuery) query).getWrappedQuery(), boost, terms);
} else if (isQueryUnsupported(query.getClass())) {
// nothing
} else {
- if (query instanceof MultiTermQuery &&
- (!expandMultiTermQuery || !fieldNameComparator(((MultiTermQuery)query).getField()))) {
+ if (query instanceof MultiTermQuery
+ && (!expandMultiTermQuery || !fieldNameComparator(((MultiTermQuery) query).getField()))) {
return;
}
Query origQuery = query;
@@ -248,7 +250,8 @@ public class WeightedSpanTermExtractor {
rewritten = origQuery.rewrite(reader);
}
if (rewritten != origQuery) {
- // only rewrite once and then flatten again - the rewritten query could have a special treatment
+ // only rewrite once and then flatten again - the rewritten query could have a special
+ // treatment
// if this method is overwritten in a subclass or above in the next recursion
extract(rewritten, boost, terms);
} else {
@@ -269,22 +272,22 @@ public class WeightedSpanTermExtractor {
return false;
}
- protected void extractUnknownQuery(Query query,
- Map<String, WeightedSpanTerm> terms) throws IOException {
-
+ protected void extractUnknownQuery(Query query, Map<String, WeightedSpanTerm> terms)
+ throws IOException {
+
// for sub-classing to extract custom queries
}
/**
- * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>SpanQuery</code>.
- *
- * @param terms
- * Map to place created WeightedSpanTerms in
- * @param spanQuery
- * SpanQuery to extract Terms from
+ * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied
+ * <code>SpanQuery</code>.
+ *
+ * @param terms Map to place created WeightedSpanTerms in
+ * @param spanQuery SpanQuery to extract Terms from
* @throws IOException If there is a low-level I/O error
*/
- protected void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery, float boost) throws IOException {
+ protected void extractWeightedSpanTerms(
+ Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery, float boost) throws IOException {
Set<String> fieldNames;
if (fieldName == null) {
@@ -298,9 +301,9 @@ public class WeightedSpanTermExtractor {
if (defaultField != null) {
fieldNames.add(defaultField);
}
-
+
Map<String, SpanQuery> queries = new HashMap<>();
-
+
Set<Term> nonWeightedTerms = new HashSet<>();
final boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
final IndexSearcher searcher = new IndexSearcher(getLeafContext());
@@ -325,7 +328,8 @@ public class WeightedSpanTermExtractor {
q = spanQuery;
}
LeafReaderContext context = getLeafContext();
- SpanWeight w = (SpanWeight) searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
+ SpanWeight w =
+ (SpanWeight) searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
Bits acceptDocs = context.reader().getLiveDocs();
final Spans spans = w.getSpans(context, SpanWeight.Postings.POSITIONS);
if (spans == null) {
@@ -348,7 +352,7 @@ public class WeightedSpanTermExtractor {
return;
}
- for (final Term queryTerm : nonWeightedTerms) {
+ for (final Term queryTerm : nonWeightedTerms) {
if (fieldNameComparator(queryTerm.field())) {
WeightedSpanTerm weightedSpanTerm = terms.get(queryTerm.text());
@@ -368,15 +372,15 @@ public class WeightedSpanTermExtractor {
}
/**
- * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied <code>Query</code>.
- *
- * @param terms
- * Map to place created WeightedSpanTerms in
- * @param query
- * Query to extract Terms from
+ * Fills a <code>Map</code> with {@link WeightedSpanTerm}s using the terms from the supplied
+ * <code>Query</code>.
+ *
+ * @param terms Map to place created WeightedSpanTerms in
+ * @param query Query to extract Terms from
* @throws IOException If there is a low-level I/O error
*/
- protected void extractWeightedTerms(Map<String,WeightedSpanTerm> terms, Query query, float boost) throws IOException {
+ protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost)
+ throws IOException {
Set<Term> nonWeightedTerms = new HashSet<>();
final IndexSearcher searcher = new IndexSearcher(getLeafContext());
searcher.rewrite(query).visit(QueryVisitor.termCollector(nonWeightedTerms));
@@ -390,12 +394,12 @@ public class WeightedSpanTermExtractor {
}
}
- /**
- * Necessary to implement matches for queries against <code>defaultField</code>
- */
+ /** Necessary to implement matches for queries against <code>defaultField</code> */
protected boolean fieldNameComparator(String fieldNameToCheck) {
- boolean rv = fieldName == null || fieldName.equals(fieldNameToCheck)
- || (defaultField != null && defaultField.equals(fieldNameToCheck));
+ boolean rv =
+ fieldName == null
+ || fieldName.equals(fieldNameToCheck)
+ || (defaultField != null && defaultField.equals(fieldNameToCheck));
return rv;
}
@@ -408,20 +412,23 @@ public class WeightedSpanTermExtractor {
cacheIt = false;
Terms termVectorTerms = ((TokenStreamFromTermVector) tokenStream).getTermVectorTerms();
if (termVectorTerms.hasPositions() && termVectorTerms.hasOffsets()) {
- internalReader = new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
+ internalReader =
+ new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
}
}
// Use MemoryIndex (index/invert this tokenStream now)
if (internalReader == null) {
- final MemoryIndex indexer = new MemoryIndex(true, usePayloads);//offsets and payloads
+ final MemoryIndex indexer = new MemoryIndex(true, usePayloads); // offsets and payloads
if (cacheIt) {
assert !cachedTokenStream;
- tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
+ tokenStream =
+ new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
cachedTokenStream = true;
indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
} else {
- indexer.addField(DelegatingLeafReader.FIELD_NAME,
+ indexer.addField(
+ DelegatingLeafReader.FIELD_NAME,
new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
}
final IndexSearcher searcher = indexer.createSearcher();
@@ -429,13 +436,13 @@ public class WeightedSpanTermExtractor {
internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
}
- //Now wrap it so we always use a common field.
+ // Now wrap it so we always use a common field.
this.internalReader = new DelegatingLeafReader(internalReader);
}
return internalReader.getContext();
}
-
+
/*
* This reader will just delegate every call to a single field in the wrapped
* LeafReader. This way we only need to build this field once rather than
@@ -450,7 +457,7 @@ public class WeightedSpanTermExtractor {
@Override
public FieldInfos getFieldInfos() {
- throw new UnsupportedOperationException();//TODO merge them
+ throw new UnsupportedOperationException(); // TODO merge them
}
@Override
@@ -462,17 +469,17 @@ public class WeightedSpanTermExtractor {
public NumericDocValues getNumericDocValues(String field) throws IOException {
return super.getNumericDocValues(FIELD_NAME);
}
-
+
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
return super.getBinaryDocValues(FIELD_NAME);
}
-
+
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
return super.getSortedDocValues(FIELD_NAME);
}
-
+
@Override
public NumericDocValues getNormValues(String field) throws IOException {
return super.getNormValues(FIELD_NAME);
@@ -490,41 +497,38 @@ public class WeightedSpanTermExtractor {
}
/**
- * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
- *
+ * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>
+ * TokenStream</code>.
+ *
* <p>
- *
- * @param query
- * that caused hit
- * @param tokenStream
- * of text to be highlighted
+ *
+ * @param query that caused the hit
+ * @param tokenStream of text to be highlighted
* @return Map containing WeightedSpanTerms
* @throws IOException If there is a low-level I/O error
*/
- public Map<String,WeightedSpanTerm> getWeightedSpanTerms(Query query, float boost, TokenStream tokenStream)
- throws IOException {
+ public Map<String, WeightedSpanTerm> getWeightedSpanTerms(
+ Query query, float boost, TokenStream tokenStream) throws IOException {
return getWeightedSpanTerms(query, boost, tokenStream, null);
}
/**
- * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
- *
+ * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>
+ * TokenStream</code>.
+ *
* <p>
- *
- * @param query
- * that caused hit
- * @param tokenStream
- * of text to be highlighted
- * @param fieldName
- * restricts Term's used based on field name
+ *
+ * @param query that caused the hit
+ * @param tokenStream of text to be highlighted
+ * @param fieldName restricts Terms used based on field name
* @return Map containing WeightedSpanTerms
* @throws IOException If there is a low-level I/O error
*/
- public Map<String,WeightedSpanTerm> getWeightedSpanTerms(Query query, float boost, TokenStream tokenStream,
- String fieldName) throws IOException {
+ public Map<String, WeightedSpanTerm> getWeightedSpanTerms(
+ Query query, float boost, TokenStream tokenStream, String fieldName) throws IOException {
this.fieldName = fieldName;
- Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<>();
+ Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<>();
this.tokenStream = tokenStream;
try {
extract(query, boost, terms);
@@ -536,24 +540,22 @@ public class WeightedSpanTermExtractor {
}
/**
- * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
- * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
- *
+ * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>
+ * TokenStream</code>. Uses a supplied <code>IndexReader</code> to properly weight terms (for
+ * gradient highlighting).
+ *
* <p>
- *
- * @param query
- * that caused hit
- * @param tokenStream
- * of text to be highlighted
- * @param fieldName
- * restricts Term's used based on field name
- * @param reader
- * to use for scoring
+ *
+ * @param query that caused the hit
+ * @param tokenStream of text to be highlighted
+ * @param fieldName restricts Terms used based on field name
+ * @param reader to use for scoring
* @return Map of WeightedSpanTerms with quasi tf/idf scores
* @throws IOException If there is a low-level I/O error
*/
- public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, float boost, TokenStream tokenStream, String fieldName,
- IndexReader reader) throws IOException {
+ public Map<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(
+ Query query, float boost, TokenStream tokenStream, String fieldName, IndexReader reader)
+ throws IOException {
if (fieldName != null) {
this.fieldName = fieldName;
} else {
@@ -561,7 +563,7 @@ public class WeightedSpanTermExtractor {
}
this.tokenStream = tokenStream;
- Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<>();
+ Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<>();
extract(query, boost, terms);
int totalNumDocs = reader.maxDoc();
@@ -585,65 +587,66 @@ public class WeightedSpanTermExtractor {
protected void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
if (spanQuery instanceof FieldMaskingSpanQuery) {
- collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
+ collectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).getMaskedQuery(), fieldNames);
} else if (spanQuery instanceof SpanFirstQuery) {
- collectSpanQueryFields(((SpanFirstQuery)spanQuery).getMatch(), fieldNames);
+ collectSpanQueryFields(((SpanFirstQuery) spanQuery).getMatch(), fieldNames);
} else if (spanQuery instanceof SpanNearQuery) {
- for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
+ for (final SpanQuery clause : ((SpanNearQuery) spanQuery).getClauses()) {
collectSpanQueryFields(clause, fieldNames);
}
} else if (spanQuery instanceof SpanNotQuery) {
- collectSpanQueryFields(((SpanNotQuery)spanQuery).getInclude(), fieldNames);
+ collectSpanQueryFields(((SpanNotQuery) spanQuery).getInclude(), fieldNames);
} else if (spanQuery instanceof SpanOrQuery) {
- for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
+ for (final SpanQuery clause : ((SpanOrQuery) spanQuery).getClauses()) {
collectSpanQueryFields(clause, fieldNames);
}
} else {
fieldNames.add(spanQuery.getField());
}
}
-
+
protected boolean mustRewriteQuery(SpanQuery spanQuery) {
if (!expandMultiTermQuery) {
return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
} else if (spanQuery instanceof FieldMaskingSpanQuery) {
- return mustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery());
+ return mustRewriteQuery(((FieldMaskingSpanQuery) spanQuery).getMaskedQuery());
} else if (spanQuery instanceof SpanFirstQuery) {
- return mustRewriteQuery(((SpanFirstQuery)spanQuery).getMatch());
+ return mustRewriteQuery(((SpanFirstQuery) spanQuery).getMatch());
} else if (spanQuery instanceof SpanNearQuery) {
- for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
+ for (final SpanQuery clause : ((SpanNearQuery) spanQuery).getClauses()) {
if (mustRewriteQuery(clause)) {
return true;
}
}
- return false;
+ return false;
} else if (spanQuery instanceof SpanNotQuery) {
- SpanNotQuery spanNotQuery = (SpanNotQuery)spanQuery;
- return mustRewriteQuery(spanNotQuery.getInclude()) || mustRewriteQuery(spanNotQuery.getExclude());
+ SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery;
+ return mustRewriteQuery(spanNotQuery.getInclude())
+ || mustRewriteQuery(spanNotQuery.getExclude());
} else if (spanQuery instanceof SpanOrQuery) {
- for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
+ for (final SpanQuery clause : ((SpanOrQuery) spanQuery).getClauses()) {
if (mustRewriteQuery(clause)) {
return true;
}
}
- return false;
+ return false;
} else if (spanQuery instanceof SpanTermQuery) {
return false;
} else {
return true;
}
}
-
+
/**
- * This class makes sure that if both position sensitive and insensitive
- * versions of the same term are added, the position insensitive one wins.
+ * This class makes sure that if both position-sensitive and position-insensitive versions of the
+ * same term are added, the position-insensitive one wins.
*/
@SuppressWarnings("serial")
- protected static class PositionCheckingMap<K> extends HashMap<K,WeightedSpanTerm> {
+ protected static class PositionCheckingMap<K> extends HashMap<K, WeightedSpanTerm> {
@Override
- public void putAll(Map<? extends K,? extends WeightedSpanTerm> m) {
- for (Map.Entry<? extends K,? extends WeightedSpanTerm> entry : m.entrySet())
+ public void putAll(Map<? extends K, ? extends WeightedSpanTerm> m) {
+ for (Map.Entry<? extends K, ? extends WeightedSpanTerm> entry : m.entrySet())
this.put(entry.getKey(), entry.getValue());
}
@@ -658,9 +661,8 @@ public class WeightedSpanTermExtractor {
}
return prev;
}
-
}
-
+
public boolean getExpandMultiTermQuery() {
return expandMultiTermQuery;
}
@@ -681,28 +683,30 @@ public class WeightedSpanTermExtractor {
return cachedTokenStream;
}
- /** Returns the tokenStream which may have been wrapped in a CachingTokenFilter.
- * getWeightedSpanTerms* sets the tokenStream, so don't call this before. */
+ /**
+ * Returns the tokenStream which may have been wrapped in a CachingTokenFilter.
+ * getWeightedSpanTerms* sets the tokenStream, so call one of those methods first.
+ */
public TokenStream getTokenStream() {
assert tokenStream != null;
return tokenStream;
}
-
+
/**
- * By default, {@link TokenStream}s that are not of the type
- * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
- * ensure an efficient reset - if you are already using a different caching
- * {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false. This setting is ignored when a term vector based TokenStream is supplied,
- * since it can be reset efficiently.
+ * By default, {@link TokenStream}s that are not of the type {@link CachingTokenFilter} are
+ * wrapped in a {@link CachingTokenFilter} to ensure an efficient reset. If you are already using
+ * a different caching {@link TokenStream} impl and you don't want it to be wrapped, set this to
+ * false. This setting is ignored when a term vector based TokenStream is supplied, since it can
+ * be reset efficiently.
*/
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
}
- /** A threshold of number of characters to analyze. When a TokenStream based on
- * term vectors with offsets and positions are supplied, this setting
- * does not apply. */
+ /**
+ * A threshold on the number of characters to analyze. When a TokenStream based on term vectors
+ * with offsets and positions is supplied, this setting does not apply.
+ */
protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
}
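
A hedged usage sketch of the API reformatted above (not part of this commit; the
analyzer, field name, and sample text are illustrative assumptions):

    import java.io.IOException;
    import java.util.Map;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.highlight.WeightedSpanTerm;
    import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;

    public class WeightedSpanTermsSketch {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        Query query = new TermQuery(new Term("body", "lucene"));
        try (TokenStream ts =
            analyzer.tokenStream("body", "Apache Lucene is a search library")) {
          // The extractor walks the query structure and returns the span terms
          // for the "body" field, keyed by term text and ready for scoring.
          Map<String, WeightedSpanTerm> terms =
              new WeightedSpanTermExtractor().getWeightedSpanTerms(query, 1.0f, ts, "body");
          terms.forEach((t, wst) -> System.out.println(t + " -> " + wst.getWeight()));
        }
      }
    }
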
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java
index 7fba55e..adaad11 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java
@@ -15,49 +15,33 @@
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
-/** Lightweight class to hold term and a weight value used for scoring this term
- */
-public class WeightedTerm
-{
+/** Lightweight class to hold a term and a weight value used for scoring this term. */
+public class WeightedTerm {
float weight; // multiplier
- String term; //stemmed form
- public WeightedTerm (float weight,String term)
- {
- this.weight=weight;
- this.term=term;
- }
+ String term; // stemmed form
+ public WeightedTerm(float weight, String term) {
+ this.weight = weight;
+ this.term = term;
+ }
- /**
- * @return the term value (stemmed)
- */
- public String getTerm()
- {
+ /** @return the term value (stemmed) */
+ public String getTerm() {
return term;
}
- /**
- * @return the weight associated with this term
- */
- public float getWeight()
- {
+ /** @return the weight associated with this term */
+ public float getWeight() {
return weight;
}
- /**
- * @param term the term value (stemmed)
- */
- public void setTerm(String term)
- {
+ /** @param term the term value (stemmed) */
+ public void setTerm(String term) {
this.term = term;
}
- /**
- * @param weight the weight associated with this term
- */
- public void setWeight(float weight)
- {
+ /** @param weight the weight associated with this term */
+ public void setWeight(float weight) {
this.weight = weight;
}
-
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/package-info.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/package-info.java
index ba6e21a..a9ab1ca 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/package-info.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/package-info.java
@@ -17,13 +17,12 @@
/**
* Highlighting search terms.
- * <p>
- * The highlight package contains classes to provide "keyword in context" features
- * typically used to highlight search terms in the text of results pages.
- * The Highlighter class is the central component and can be used to extract the
- * most interesting sections of a piece of text and highlight them, with the help of
- * Fragmenter, fragment Scorer, and Formatter classes.
- *
+ *
+ * <p>The highlight package contains classes to provide "keyword in context" features typically used
+ * to highlight search terms in the text of results pages. The Highlighter class is the central
+ * component and can be used to extract the most interesting sections of a piece of text and
+ * highlight them, with the help of Fragmenter, fragment Scorer, and Formatter classes.
+ *
* <h2>Example Usage</h2>
*
* <pre class="prettyprint">
@@ -31,9 +30,9 @@
* IndexSearcher searcher = new IndexSearcher(directory);
* QueryParser parser = new QueryParser("notv", analyzer);
* Query query = parser.parse("million");
- *
+ *
* TopDocs hits = searcher.search(query, 10);
- *
+ *
* SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
* Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
* for (int i = 0; i < 10; i++) {
@@ -59,37 +58,38 @@
* System.out.println("-------------");
* }
* </pre>
- *
+ *
* <h2>New features 06/02/2005</h2>
- *
- * This release adds options for encoding (thanks to Nicko Cadell).
- * An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode
- * all those non-xhtml standard characters such as & into legal values. This simple class may not suffice for
- * some languages - Commons Lang has an implementation that could be used: escapeHtml(String) in
+ *
+ * This release adds options for encoding (thanks to Nicko Cadell). An "Encoder" implementation such
+ * as the new SimpleHTMLEncoder class can be passed to the highlighter to encode all those non-xhtml
+ * standard characters such as & into legal values. This simple class may not suffice for some
+ * languages - Commons Lang has an implementation that could be used: escapeHtml(String) in
* http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
- *
+ *
* <h2>New features 22/12/2004</h2>
- *
+ *
* This release adds some new capabilities:
+ *
* <ol>
- * <li>Faster highlighting using Term vector support</li>
- * <li>New formatting options to use color intensity to show informational value</li>
- * <li>Options for better summarization by using term IDF scores to influence fragment selection</li>
+ * <li>Faster highlighting using Term vector support
+ * <li>New formatting options to use color intensity to show informational value
+ * <li>Options for better summarization by using term IDF scores to influence fragment selection
* </ol>
- *
- * <p>
- * The highlighter takes a TokenStream as input. Until now these streams have typically been produced
- * using an Analyzer but the new class TokenSources provides helper methods for obtaining TokenStreams from
- * the new TermVector position support (see latest CVS version).</p>
- *
- * <p>The new class GradientFormatter can use a scale of colors to highlight terms according to their score.
- * A subtle use of color can help emphasise the reasons for matching (useful when doing "MoreLikeThis" queries and
- * you want to see what the basis of the similarities are).</p>
- *
- * <p>The QueryScorer class has constructors that use an IndexReader to derive the IDF (inverse document frequency)
- * for each term in order to influence the score. This is useful for helping to extracting the most significant sections
- * of a document and in supplying scores used by the new GradientFormatter to color significant words more strongly.
- * The QueryScorer.getMaxTermWeight method is useful when passed to the GradientFormatter constructor to define the top score
- * which is associated with the top color.</p>
+ *
+ * <p>The highlighter takes a TokenStream as input. Until now these streams have typically been
+ * produced using an Analyzer, but the new class TokenSources provides helper methods for obtaining
+ * TokenStreams from the new TermVector position support (see latest CVS version).
+ *
+ * <p>The new class GradientFormatter can use a scale of colors to highlight terms according to
+ * their score. A subtle use of color can help emphasise the reasons for matching (useful when doing
+ * "MoreLikeThis" queries and you want to see what the basis of the similarities is).
+ *
+ * <p>The QueryScorer class has constructors that use an IndexReader to derive the IDF (inverse
+ * document frequency) for each term in order to influence the score. This is useful for extracting
+ * the most significant sections of a document and for supplying scores used by the new
+ * GradientFormatter to color significant words more strongly. The QueryScorer.getMaxTermWeight
+ * method is useful when passed to the GradientFormatter constructor to define the top score which
+ * is associated with the top color.
*/
package org.apache.lucene.search.highlight;
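
A hedged sketch of the gradient-highlighting combination the javadoc above
describes (not part of this commit; the field name "notv" follows the existing
example, and the colors are illustrative):

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.highlight.GradientFormatter;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;

    public final class GradientHighlightSketch {
      static String highlight(Query query, Analyzer analyzer, String text)
          throws IOException, InvalidTokenOffsetsException {
        QueryScorer scorer = new QueryScorer(query);
        // Background shades toward red as a term's weight approaches the top
        // score; null foreground colors leave the text color untouched.
        GradientFormatter formatter =
            new GradientFormatter(scorer.getMaxTermWeight(), null, null, "#FFFFFF", "#FF0000");
        return new Highlighter(formatter, scorer).getBestFragment(analyzer, "notv", text);
      }
    }
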
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/BreakIteratorShrinkingAdjuster.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/BreakIteratorShrinkingAdjuster.java
index 21166da..744ecf2 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/BreakIteratorShrinkingAdjuster.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/BreakIteratorShrinkingAdjuster.java
@@ -20,8 +20,8 @@ import java.text.BreakIterator;
import java.util.Locale;
/**
- * A {@link PassageAdjuster} that adjusts the {@link Passage} range to
- * word boundaries hinted by the given {@link BreakIterator}.
+ * A {@link PassageAdjuster} that adjusts the {@link Passage} range to word boundaries hinted by the
+ * given {@link BreakIterator}.
*/
public class BreakIteratorShrinkingAdjuster implements PassageAdjuster {
private final BreakIterator bi;
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/CharSequenceIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/CharSequenceIterator.java
index 701717f..40cb8cba 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/CharSequenceIterator.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/CharSequenceIterator.java
@@ -18,9 +18,7 @@ package org.apache.lucene.search.matchhighlight;
import java.text.CharacterIterator;
-/**
- * A {@link CharacterIterator} over a {@link CharSequence}.
- */
+/** A {@link CharacterIterator} over a {@link CharSequence}. */
final class CharSequenceIterator implements CharacterIterator {
private final CharSequence text;
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java
index ece6693..d04318a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/FieldValueHighlighters.java
@@ -26,16 +26,17 @@ import java.util.function.BiPredicate;
import java.util.function.Predicate;
/**
- * A factory of {@link org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter} classes
- * that cover typical use cases (verbatim values, highlights, abbreviations).
+ * A factory of {@link
+ * org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter} classes that
+ * cover typical use cases (verbatim values, highlights, abbreviations).
*
* @see MatchHighlighter#appendFieldHighlighter
*/
public final class FieldValueHighlighters {
- private FieldValueHighlighters() {
- }
+ private FieldValueHighlighters() {}
- private static abstract class AbstractFieldValueHighlighter implements MatchHighlighter.FieldValueHighlighter {
+ private abstract static class AbstractFieldValueHighlighter
+ implements MatchHighlighter.FieldValueHighlighter {
private final BiPredicate<String, Boolean> testPredicate;
protected AbstractFieldValueHighlighter(BiPredicate<String, Boolean> testPredicate) {
@@ -49,18 +50,24 @@ public final class FieldValueHighlighters {
}
/**
- * Displays up to {@code maxLeadingCharacters} of the field's value, regardless of whether it contained
- * highlights or not.
+ * Displays up to {@code maxLeadingCharacters} of the field's value, regardless of whether it
+ * contained highlights or not.
*/
- public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(int maxLeadingCharacters, String ellipsis, Set<String> fields) {
+ public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(
+ int maxLeadingCharacters, String ellipsis, Set<String> fields) {
PassageSelector passageSelector = defaultPassageSelector();
PassageFormatter passageFormatter = new PassageFormatter(ellipsis, "", "");
return new AbstractFieldValueHighlighter((field, hasMatches) -> fields.contains(field)) {
@Override
- public List<String> format(String field, String[] values, String contiguousValue,
- List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+ public List<String> format(
+ String field,
+ String[] values,
+ String contiguousValue,
+ List<OffsetRange> valueRanges,
+ List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
List<Passage> bestPassages =
- passageSelector.pickBest(contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges);
+ passageSelector.pickBest(
+ contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges);
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
@@ -72,13 +79,10 @@ public final class FieldValueHighlighters {
};
}
- /**
- * Default preconfigured {@link PassageSelector}.
- */
+ /** Default preconfigured {@link PassageSelector}. */
public static PassageSelector defaultPassageSelector() {
return new PassageSelector(
- PassageSelector.DEFAULT_SCORER,
- new BreakIteratorShrinkingAdjuster());
+ PassageSelector.DEFAULT_SCORER, new BreakIteratorShrinkingAdjuster());
}
/**
@@ -90,24 +94,29 @@ public final class FieldValueHighlighters {
PassageFormatter passageFormatter,
Predicate<String> matchFields) {
PassageSelector passageSelector = defaultPassageSelector();
- return new AbstractFieldValueHighlighter((field, hasMatches) -> matchFields.test(field) && hasMatches) {
+ return new AbstractFieldValueHighlighter(
+ (field, hasMatches) -> matchFields.test(field) && hasMatches) {
@Override
- public List<String> format(String field, String[] values, String contiguousValue,
- List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+ public List<String> format(
+ String field,
+ String[] values,
+ String contiguousValue,
+ List<OffsetRange> valueRanges,
+ List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
assert matchOffsets != null;
List<Passage> bestPassages =
- passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
+ passageSelector.pickBest(
+ contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
};
}
- /**
- * Always returns raw field values, no highlighting or value truncation is applied.
- */
- public static MatchHighlighter.FieldValueHighlighter verbatimValue(String field, String... moreFields) {
+ /** Always returns raw field values, no highlighting or value truncation is applied. */
+ public static MatchHighlighter.FieldValueHighlighter verbatimValue(
+ String field, String... moreFields) {
HashSet<String> matchFields = new HashSet<>(Arrays.asList(moreFields));
matchFields.add(field);
return new AbstractFieldValueHighlighter((fld, hasMatches) -> matchFields.contains(fld)) {
@@ -117,21 +126,30 @@ public final class FieldValueHighlighters {
}
@Override
- public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
- List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+ public List<String> format(
+ String field,
+ String[] values,
+ String contiguousValue,
+ List<OffsetRange> valueRanges,
+ List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
return Arrays.asList(values);
}
};
}
/**
- * Matches all fields and omits their value in the output (so that no highlight or value is emitted).
+ * Matches all fields and omits their value in the output (so that no highlight or value is
+ * emitted).
*/
public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
return new AbstractFieldValueHighlighter((field, hasMatches) -> true) {
@Override
- public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
- List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
+ public List<String> format(
+ String field,
+ String[] values,
+ String contiguousValue,
+ List<OffsetRange> valueRanges,
+ List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
return null;
}
};
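
Since ordering in the highlighter chain is first-matching-wins, a hedged sketch
of combining these factory methods (not part of this commit; field names,
passage limits, and markup are illustrative):

    import java.util.function.Predicate;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.matchhighlight.FieldValueHighlighters;
    import org.apache.lucene.search.matchhighlight.MatchHighlighter;
    import org.apache.lucene.search.matchhighlight.PassageFormatter;

    public final class FieldHighlighterChainSketch {
      static MatchHighlighter configure(IndexSearcher searcher, Analyzer analyzer) {
        Predicate<String> bodyOnly = "body"::equals;
        return new MatchHighlighter(searcher, analyzer)
            // Identifier-like fields come back verbatim, with no markup.
            .appendFieldHighlighter(FieldValueHighlighters.verbatimValue("id", "title"))
            // Matching "body" values become up to 3 passages within a
            // 100-character window, with hits wrapped in >...< markers.
            .appendFieldHighlighter(
                FieldValueHighlighters.highlighted(
                    100, 3, new PassageFormatter("...", ">", "<"), bodyOnly))
            // Any remaining field is omitted from the output.
            .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
      }
    }
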
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java
index 277a324..378c557 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java
@@ -16,17 +16,6 @@
*/
package org.apache.lucene.search.matchhighlight;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopDocs;
-
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
@@ -39,14 +28,24 @@ import java.util.Map;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.stream.Stream;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DocumentStoredFieldVisitor;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
/**
- * An example highlighter that combines several lower-level highlighting
- * utilities in this package into a fully featured, ready-to-use component.
- * <p>
- * Note that if you need to customize or tweak the details of highlighting,
- * it is better to assemble your own highlighter using those low-level
- * building blocks, rather than extend or modify this one.
+ * An example highlighter that combines several lower-level highlighting utilities in this package
+ * into a fully featured, ready-to-use component.
+ *
+ * <p>Note that if you need to customize or tweak the details of highlighting, it is better to
+ * assemble your own highlighter using those low-level building blocks, rather than extend or modify
+ * this one.
*/
public class MatchHighlighter {
private final IndexSearcher searcher;
@@ -57,10 +56,9 @@ public class MatchHighlighter {
private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();
/**
- * Actual per-field highlighter. Field highlighters are probed whether they
- * are applicable to a particular combination of (field, hasMatches) pair. If a highlighter
- * declares it is applicable, its {@link #format} method is invoked and the result
- * is returned as the field's value.
+ * Actual per-field highlighter. Field highlighters are probed to see whether they apply to a
+ * particular (field, hasMatches) combination. If a highlighter declares it is applicable,
+ * its {@link #format} method is invoked and the result is returned as the field's value.
*
* @see FieldValueHighlighters
*/
@@ -73,24 +71,24 @@ public class MatchHighlighter {
*/
boolean isApplicable(String field, boolean hasMatches);
- /**
- * Do format field values appropriately.
- */
- List<String> format(String field, String[] values, String contiguousValue,
- List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets);
+ /** Format field values appropriately. */
+ List<String> format(
+ String field,
+ String[] values,
+ String contiguousValue,
+ List<OffsetRange> valueRanges,
+ List<QueryOffsetRange> matchOffsets);
/**
- * @return Returns a set of fields that must be fetched for each document, regardless
- * of whether they had matches or not. This is useful to load and return certain fields
- * that should always be included (identifiers, document titles, etc.).
+ * @return A set of fields that must be fetched for each document, regardless of whether
+ * they had matches or not. This is useful to load and return certain fields that should
+ * always be included (identifiers, document titles, etc.).
*/
default Collection<String> alwaysFetchedFields() {
return Collections.emptyList();
}
- /**
- * Returns a new field value highlighter that is a combination of this one and another one.
- */
+ /** Returns a new field value highlighter that is a combination of this one and another one. */
default FieldValueHighlighter or(FieldValueHighlighter other) {
FieldValueHighlighter first = this;
FieldValueHighlighter second = other;
@@ -102,15 +100,20 @@ public class MatchHighlighter {
return new FieldValueHighlighter() {
@Override
public boolean isApplicable(String field, boolean hasMatches) {
- return first.isApplicable(field, hasMatches)
- || second.isApplicable(field, hasMatches);
+ return first.isApplicable(field, hasMatches) || second.isApplicable(field, hasMatches);
}
@Override
- public List<String> format(String field, String[] values, String contiguousValue,
- List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets) {
+ public List<String> format(
+ String field,
+ String[] values,
+ String contiguousValue,
+ List<OffsetRange> valueRanges,
+ List<QueryOffsetRange> matchOffsets) {
FieldValueHighlighter delegate =
- first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty()) ? first : second;
+ first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty())
+ ? first
+ : second;
return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
}
@@ -123,8 +126,8 @@ public class MatchHighlighter {
}
/**
- * Append a new highlighter to field highlighters chain. The order of field highlighters
- * is important (first-matching wins).
+ * Append a new highlighter to the field highlighter chain. The order of field highlighters is
+ * important (first-matching wins).
*/
public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
fieldHighlighters.add(highlighter);
@@ -132,18 +135,14 @@ public class MatchHighlighter {
return this;
}
- /**
- * Always fetch the given set of fields for all input documents.
- */
+ /** Always fetch the given set of fields for all input documents. */
public void alwaysFetchFields(String... fields) {
for (String fld : fields) {
fieldsAlwaysReturned.add(Objects.requireNonNull(fld));
}
}
- /**
- * Single document's highlights.
- */
+ /** Single document's highlights. */
public static class DocHighlights {
public final int docId;
public final Map<String, List<String>> fields = new LinkedHashMap<>();
@@ -153,9 +152,7 @@ public class MatchHighlighter {
}
}
- /**
- * An {@link OffsetRange} of a match, together with the source query that caused it.
- */
+ /** An {@link OffsetRange} of a match, together with the source query that caused it. */
public static class QueryOffsetRange extends OffsetRange {
public final Query query;
@@ -174,8 +171,7 @@ public class MatchHighlighter {
final int docId;
private final LeafReader leafReader;
private final int leafDocId;
- private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges
- = new LinkedHashMap<>();
+ private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();
DocHit(int docId, LeafReader leafReader, int leafDocId) {
this.docId = docId;
@@ -184,21 +180,25 @@ public class MatchHighlighter {
}
void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
- hits.forEach((field, offsets) -> {
- List<QueryOffsetRange> target = matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
- offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
- });
+ hits.forEach(
+ (field, offsets) -> {
+ List<QueryOffsetRange> target =
+ matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
+ offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
+ });
}
Document document(Predicate<String> needsField) throws IOException {
// Only load the fields that have a chance to be highlighted.
- DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor() {
- @Override
- public Status needsField(FieldInfo fieldInfo) {
- return (matchRanges.containsKey(fieldInfo.name) ||
- needsField.test(fieldInfo.name)) ? Status.YES : Status.NO;
- }
- };
+ DocumentStoredFieldVisitor visitor =
+ new DocumentStoredFieldVisitor() {
+ @Override
+ public Status needsField(FieldInfo fieldInfo) {
+ return (matchRanges.containsKey(fieldInfo.name) || needsField.test(fieldInfo.name))
+ ? Status.YES
+ : Status.NO;
+ }
+ };
leafReader.document(leafDocId, visitor);
return visitor.getDocument();
@@ -206,12 +206,16 @@ public class MatchHighlighter {
}
public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
- this(searcher, analyzer, MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
+ this(
+ searcher,
+ analyzer,
+ MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
- public MatchHighlighter(IndexSearcher searcher,
- Analyzer analyzer,
- OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
+ public MatchHighlighter(
+ IndexSearcher searcher,
+ Analyzer analyzer,
+ OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
this.searcher = searcher;
this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
this.analyzer = analyzer;
@@ -229,8 +233,12 @@ public class MatchHighlighter {
for (Query q : queries) {
MatchRegionRetriever highlighter =
new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
- highlighter.highlightDocuments(topDocs,
- (int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) -> {
+ highlighter.highlightDocuments(
+ topDocs,
+ (int docId,
+ LeafReader leafReader,
+ int leafDocId,
+ Map<String, List<OffsetRange>> hits) -> {
DocHit docHit = docHits.get(docId);
if (docHit == null) {
docHit = new DocHit(docId, leafReader, leafDocId);
@@ -267,8 +275,9 @@ public class MatchHighlighter {
List<OffsetRange> valueRanges = computeValueRanges(field, values);
List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
- List<String> formattedValues = fieldValueHighlighter(field, offsets != null)
- .format(field, values, contiguousValue, valueRanges, offsets);
+ List<String> formattedValues =
+ fieldValueHighlighter(field, offsets != null)
+ .format(field, values, contiguousValue, valueRanges, offsets);
if (formattedValues != null) {
docHighlights.fields.put(field, formattedValues);
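
An end-to-end hedged sketch of driving this example highlighter (not part of
this commit; the query and the printed output are illustrative):

    import java.io.IOException;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.matchhighlight.MatchHighlighter;

    public final class HighlightDocsSketch {
      static void print(MatchHighlighter highlighter, IndexSearcher searcher, Query query)
          throws IOException {
        TopDocs topDocs = searcher.search(query, 10);
        // One DocHighlights per document; its public 'fields' map holds the
        // snippets produced by the field highlighter chain.
        highlighter
            .highlightDocuments(topDocs, query)
            .forEach(
                dh ->
                    dh.fields.forEach(
                        (field, snippets) -> System.out.println(field + ": " + snippets)));
      }
    }
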
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
index 3f07aca..b71f37a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java
@@ -16,6 +16,18 @@
*/
package org.apache.lucene.search.matchhighlight;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
@@ -32,22 +44,9 @@ import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.PrimitiveIterator;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import java.util.function.Predicate;
-
/**
- * Utility class to compute a list of "match regions" for a given query, searcher and
- * document(s) using {@link Matches} API.
+ * Utility class to compute a list of "match regions" for a given query, searcher and document(s)
+ * using the {@link Matches} API.
*/
public class MatchRegionRetriever {
private final List<LeafReaderContext> leaves;
@@ -58,20 +57,21 @@ public class MatchRegionRetriever {
/**
* A callback for accepting a single document (and its associated leaf reader, leaf document ID)
- * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for
- * the query.
+ * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the
+ * query.
*/
@FunctionalInterface
public interface MatchOffsetsConsumer {
- void accept(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
+ void accept(
+ int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
throws IOException;
}
/**
- * An abstraction that provides document values for a given field. Default implementation
- * in {@link DocumentFieldValueProvider} just reaches to a preloaded {@link Document}. It is
- * possible to write a more efficient implementation on top of a reusable character buffer
- * (that reuses the buffer while retrieving hit regions for documents).
+ * An abstraction that provides document values for a given field. Default implementation in
+ * {@link DocumentFieldValueProvider} just reads from a preloaded {@link Document}. It is possible
+ * to write a more efficient implementation on top of a reusable character buffer (that reuses the
+ * buffer while retrieving hit regions for documents).
*/
@FunctionalInterface
public interface FieldValueProvider {
@@ -81,23 +81,26 @@ public class MatchRegionRetriever {
/**
* A constructor with the default offset strategy supplier.
*
- * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
- * in the absence of position offsets in the index. Note that the analyzer must return
- * tokens (positions and offsets) identical to the ones stored in the index.
+ * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the
+ * absence of position offsets in the index. Note that the analyzer must return tokens
+ * (positions and offsets) identical to the ones stored in the index.
*/
- public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
+ public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer)
+ throws IOException {
this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
... 24691 lines suppressed ...
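
For completeness, a hedged sketch of the lower-level MatchRegionRetriever
callback shown above (not part of this commit; the printing consumer is
illustrative):

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.matchhighlight.MatchRegionRetriever;

    public final class MatchRegionsSketch {
      static void print(IndexSearcher searcher, Query query, Analyzer analyzer)
          throws IOException {
        TopDocs topDocs = searcher.search(query, 10);
        MatchRegionRetriever retriever =
            new MatchRegionRetriever(searcher, searcher.rewrite(query), analyzer);
        // The consumer receives each document's match offsets grouped by field.
        retriever.highlightDocuments(
            topDocs,
            (docId, leafReader, leafDocId, hits) ->
                hits.forEach(
                    (field, ranges) -> System.out.println(docId + " " + field + " -> " + ranges)));
      }
    }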