You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2020/12/28 11:27:05 UTC
[lucene-solr] 02/03: LUCENE-9570: code reformatting [partial].
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 8ef6a0da56878177ff8d6880c92e8f7d0321d076
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Mon Dec 28 12:26:13 2020 +0100
LUCENE-9570: code reformatting [partial].
---
gradle/generation/jflex.gradle | 46 +-
gradle/validation/spotless.gradle | 43 +-
.../lucene/analysis/bg/BulgarianAnalyzer.java | 76 +-
.../lucene/analysis/bg/BulgarianStemFilter.java | 24 +-
.../analysis/bg/BulgarianStemFilterFactory.java | 12 +-
.../lucene/analysis/bg/BulgarianStemmer.java | 86 +-
.../apache/lucene/analysis/bg/package-info.java | 4 +-
.../apache/lucene/analysis/bn/BengaliAnalyzer.java | 65 +-
.../analysis/bn/BengaliNormalizationFilter.java | 25 +-
.../bn/BengaliNormalizationFilterFactory.java | 14 +-
.../lucene/analysis/bn/BengaliNormalizer.java | 72 +-
.../lucene/analysis/bn/BengaliStemFilter.java | 15 +-
.../analysis/bn/BengaliStemFilterFactory.java | 16 +-
.../apache/lucene/analysis/bn/BengaliStemmer.java | 255 +-
.../apache/lucene/analysis/bn/package-info.java | 4 +-
.../analysis/boost/DelimitedBoostTokenFilter.java | 14 +-
.../boost/DelimitedBoostTokenFilterFactory.java | 16 +-
.../apache/lucene/analysis/boost/package-info.java | 4 +-
.../lucene/analysis/br/BrazilianAnalyzer.java | 72 +-
.../lucene/analysis/br/BrazilianStemFilter.java | 31 +-
.../analysis/br/BrazilianStemFilterFactory.java | 14 +-
.../lucene/analysis/br/BrazilianStemmer.java | 1202 ++++---
.../apache/lucene/analysis/br/package-info.java | 4 +-
.../apache/lucene/analysis/ca/CatalanAnalyzer.java | 66 +-
.../apache/lucene/analysis/ca/package-info.java | 4 +-
.../lucene/analysis/charfilter/BaseCharFilter.java | 49 +-
.../charfilter/HTMLStripCharFilterFactory.java | 11 +-
.../analysis/charfilter/MappingCharFilter.java | 26 +-
.../charfilter/MappingCharFilterFactory.java | 80 +-
.../analysis/charfilter/NormalizeCharMap.java | 46 +-
.../lucene/analysis/charfilter/package-info.java | 61 +-
.../apache/lucene/analysis/cjk/CJKAnalyzer.java | 35 +-
.../lucene/analysis/cjk/CJKBigramFilter.java | 191 +-
.../analysis/cjk/CJKBigramFilterFactory.java | 16 +-
.../lucene/analysis/cjk/CJKWidthCharFilter.java | 67 +-
.../analysis/cjk/CJKWidthCharFilterFactory.java | 8 +-
.../apache/lucene/analysis/cjk/CJKWidthFilter.java | 75 +-
.../lucene/analysis/cjk/CJKWidthFilterFactory.java | 14 +-
.../apache/lucene/analysis/cjk/package-info.java | 29 +-
.../apache/lucene/analysis/ckb/SoraniAnalyzer.java | 57 +-
.../analysis/ckb/SoraniNormalizationFilter.java | 9 +-
.../ckb/SoraniNormalizationFilterFactory.java | 8 +-
.../lucene/analysis/ckb/SoraniNormalizer.java | 28 +-
.../lucene/analysis/ckb/SoraniStemFilter.java | 24 +-
.../analysis/ckb/SoraniStemFilterFactory.java | 8 +-
.../apache/lucene/analysis/ckb/SoraniStemmer.java | 52 +-
.../apache/lucene/analysis/ckb/package-info.java | 4 +-
.../lucene/analysis/classic/ClassicAnalyzer.java | 60 +-
.../lucene/analysis/classic/ClassicFilter.java | 28 +-
.../analysis/classic/ClassicFilterFactory.java | 12 +-
.../lucene/analysis/classic/ClassicTokenizer.java | 99 +-
.../analysis/classic/ClassicTokenizerFactory.java | 7 +-
.../analysis/classic/ClassicTokenizerImpl.java | 706 ++--
.../analysis/classic/ClassicTokenizerImpl.jflex | 2 +-
.../lucene/analysis/classic/package-info.java | 19 +-
.../analysis/commongrams/CommonGramsFilter.java | 80 +-
.../commongrams/CommonGramsFilterFactory.java | 12 +-
.../commongrams/CommonGramsQueryFilter.java | 62 +-
.../commongrams/CommonGramsQueryFilterFactory.java | 10 +-
.../lucene/analysis/commongrams/package-info.java | 4 +-
.../compound/CompoundWordTokenFilterBase.java | 78 +-
.../DictionaryCompoundWordTokenFilter.java | 89 +-
.../DictionaryCompoundWordTokenFilterFactory.java | 24 +-
.../HyphenationCompoundWordTokenFilter.java | 118 +-
.../HyphenationCompoundWordTokenFilterFactory.java | 58 +-
.../analysis/compound/hyphenation/ByteVector.java | 31 +-
.../analysis/compound/hyphenation/CharVector.java | 33 +-
.../analysis/compound/hyphenation/Hyphen.java | 24 +-
.../analysis/compound/hyphenation/Hyphenation.java | 17 +-
.../compound/hyphenation/HyphenationTree.java | 179 +-
.../compound/hyphenation/PatternConsumer.java | 25 +-
.../compound/hyphenation/PatternParser.java | 67 +-
.../analysis/compound/hyphenation/TernaryTree.java | 183 +-
.../compound/hyphenation/package-info.java | 9 +-
.../lucene/analysis/compound/package-info.java | 123 +-
.../lucene/analysis/core/DecimalDigitFilter.java | 16 +-
.../analysis/core/DecimalDigitFilterFactory.java | 12 +-
.../lucene/analysis/core/FlattenGraphFilter.java | 143 +-
.../analysis/core/FlattenGraphFilterFactory.java | 11 +-
.../lucene/analysis/core/KeywordAnalyzer.java | 8 +-
.../lucene/analysis/core/KeywordTokenizer.java | 37 +-
.../analysis/core/KeywordTokenizerFactory.java | 31 +-
.../lucene/analysis/core/LetterTokenizer.java | 47 +-
.../analysis/core/LetterTokenizerFactory.java | 26 +-
.../lucene/analysis/core/LowerCaseFilter.java | 13 +-
.../analysis/core/LowerCaseFilterFactory.java | 11 +-
.../lucene/analysis/core/SimpleAnalyzer.java | 16 +-
.../apache/lucene/analysis/core/StopAnalyzer.java | 41 +-
.../apache/lucene/analysis/core/StopFilter.java | 20 +-
.../lucene/analysis/core/StopFilterFactory.java | 62 +-
.../lucene/analysis/core/TypeTokenFilter.java | 24 +-
.../analysis/core/TypeTokenFilterFactory.java | 18 +-
.../analysis/core/UnicodeWhitespaceAnalyzer.java | 14 +-
.../analysis/core/UnicodeWhitespaceTokenizer.java | 37 +-
.../lucene/analysis/core/UpperCaseFilter.java | 20 +-
.../analysis/core/UpperCaseFilterFactory.java | 21 +-
.../lucene/analysis/core/WhitespaceAnalyzer.java | 12 +-
.../lucene/analysis/core/WhitespaceTokenizer.java | 38 +-
.../analysis/core/WhitespaceTokenizerFactory.java | 28 +-
.../apache/lucene/analysis/core/package-info.java | 4 +-
.../lucene/analysis/custom/CustomAnalyzer.java | 467 +--
.../lucene/analysis/custom/package-info.java | 4 +-
.../apache/lucene/analysis/cz/CzechAnalyzer.java | 61 +-
.../apache/lucene/analysis/cz/CzechStemFilter.java | 24 +-
.../lucene/analysis/cz/CzechStemFilterFactory.java | 12 +-
.../apache/lucene/analysis/cz/CzechStemmer.java | 141 +-
.../apache/lucene/analysis/cz/package-info.java | 4 +-
.../apache/lucene/analysis/da/DanishAnalyzer.java | 57 +-
.../apache/lucene/analysis/da/package-info.java | 4 +-
.../apache/lucene/analysis/de/GermanAnalyzer.java | 93 +-
.../lucene/analysis/de/GermanLightStemFilter.java | 16 +-
.../analysis/de/GermanLightStemFilterFactory.java | 12 +-
.../lucene/analysis/de/GermanLightStemmer.java | 125 +-
.../analysis/de/GermanMinimalStemFilter.java | 16 +-
.../de/GermanMinimalStemFilterFactory.java | 12 +-
.../lucene/analysis/de/GermanMinimalStemmer.java | 99 +-
.../analysis/de/GermanNormalizationFilter.java | 37 +-
.../de/GermanNormalizationFilterFactory.java | 12 +-
.../lucene/analysis/de/GermanStemFilter.java | 93 +-
.../analysis/de/GermanStemFilterFactory.java | 16 +-
.../apache/lucene/analysis/de/GermanStemmer.java | 412 ++-
.../apache/lucene/analysis/de/package-info.java | 4 +-
.../apache/lucene/analysis/el/GreekAnalyzer.java | 66 +-
.../lucene/analysis/el/GreekLowerCaseFilter.java | 66 +-
.../analysis/el/GreekLowerCaseFilterFactory.java | 14 +-
.../apache/lucene/analysis/el/GreekStemFilter.java | 33 +-
.../lucene/analysis/el/GreekStemFilterFactory.java | 14 +-
.../apache/lucene/analysis/el/GreekStemmer.java | 1091 +++---
.../apache/lucene/analysis/el/package-info.java | 4 +-
.../analysis/email/UAX29URLEmailAnalyzer.java | 59 +-
.../analysis/email/UAX29URLEmailTokenizer.java | 80 +-
.../email/UAX29URLEmailTokenizerFactory.java | 11 +-
.../apache/lucene/analysis/email/package-info.java | 30 +-
.../apache/lucene/analysis/en/EnglishAnalyzer.java | 61 +-
.../analysis/en/EnglishMinimalStemFilter.java | 16 +-
.../en/EnglishMinimalStemFilterFactory.java | 14 +-
.../lucene/analysis/en/EnglishMinimalStemmer.java | 24 +-
.../analysis/en/EnglishPossessiveFilter.java | 20 +-
.../en/EnglishPossessiveFilterFactory.java | 14 +-
.../org/apache/lucene/analysis/en/KStemData1.java | 1412 ++++----
.../org/apache/lucene/analysis/en/KStemData2.java | 1411 ++++----
.../org/apache/lucene/analysis/en/KStemData3.java | 1411 ++++----
.../org/apache/lucene/analysis/en/KStemData4.java | 1411 ++++----
.../org/apache/lucene/analysis/en/KStemData5.java | 1411 ++++----
.../org/apache/lucene/analysis/en/KStemData6.java | 1411 ++++----
.../org/apache/lucene/analysis/en/KStemData7.java | 1411 ++++----
.../org/apache/lucene/analysis/en/KStemData8.java | 1209 ++++---
.../org/apache/lucene/analysis/en/KStemFilter.java | 41 +-
.../lucene/analysis/en/KStemFilterFactory.java | 10 +-
.../org/apache/lucene/analysis/en/KStemmer.java | 1265 ++++---
.../lucene/analysis/en/PorterStemFilter.java | 62 +-
.../analysis/en/PorterStemFilterFactory.java | 11 +-
.../apache/lucene/analysis/en/PorterStemmer.java | 500 +--
.../apache/lucene/analysis/en/package-info.java | 4 +-
.../apache/lucene/analysis/es/SpanishAnalyzer.java | 58 +-
.../lucene/analysis/es/SpanishLightStemFilter.java | 16 +-
.../analysis/es/SpanishLightStemFilterFactory.java | 14 +-
.../lucene/analysis/es/SpanishLightStemmer.java | 92 +-
.../analysis/es/SpanishMinimalStemFilter.java | 13 +-
.../es/SpanishMinimalStemFilterFactory.java | 11 +-
.../lucene/analysis/es/SpanishMinimalStemmer.java | 55 +-
.../apache/lucene/analysis/es/package-info.java | 4 +-
.../lucene/analysis/et/EstonianAnalyzer.java | 155 +-
.../apache/lucene/analysis/et/package-info.java | 4 +-
.../apache/lucene/analysis/eu/BasqueAnalyzer.java | 54 +-
.../apache/lucene/analysis/eu/package-info.java | 4 +-
.../apache/lucene/analysis/fa/PersianAnalyzer.java | 77 +-
.../lucene/analysis/fa/PersianCharFilter.java | 14 +-
.../analysis/fa/PersianCharFilterFactory.java | 7 +-
.../analysis/fa/PersianNormalizationFilter.java | 14 +-
.../fa/PersianNormalizationFilterFactory.java | 14 +-
.../lucene/analysis/fa/PersianNormalizer.java | 53 +-
.../apache/lucene/analysis/fa/package-info.java | 4 +-
.../apache/lucene/analysis/fi/FinnishAnalyzer.java | 57 +-
.../lucene/analysis/fi/FinnishLightStemFilter.java | 16 +-
.../analysis/fi/FinnishLightStemFilterFactory.java | 14 +-
.../lucene/analysis/fi/FinnishLightStemmer.java | 237 +-
.../apache/lucene/analysis/fi/package-info.java | 4 +-
.../apache/lucene/analysis/fr/FrenchAnalyzer.java | 99 +-
.../lucene/analysis/fr/FrenchLightStemFilter.java | 16 +-
.../analysis/fr/FrenchLightStemFilterFactory.java | 12 +-
.../lucene/analysis/fr/FrenchLightStemmer.java | 261 +-
.../analysis/fr/FrenchMinimalStemFilter.java | 16 +-
.../fr/FrenchMinimalStemFilterFactory.java | 14 +-
.../lucene/analysis/fr/FrenchMinimalStemmer.java | 66 +-
.../apache/lucene/analysis/fr/package-info.java | 4 +-
.../apache/lucene/analysis/ga/IrishAnalyzer.java | 79 +-
.../lucene/analysis/ga/IrishLowerCaseFilter.java | 22 +-
.../analysis/ga/IrishLowerCaseFilterFactory.java | 14 +-
.../apache/lucene/analysis/ga/package-info.java | 4 +-
.../lucene/analysis/gl/GalicianAnalyzer.java | 57 +-
.../analysis/gl/GalicianMinimalStemFilter.java | 16 +-
.../gl/GalicianMinimalStemFilterFactory.java | 14 +-
.../lucene/analysis/gl/GalicianMinimalStemmer.java | 17 +-
.../lucene/analysis/gl/GalicianStemFilter.java | 18 +-
.../analysis/gl/GalicianStemFilterFactory.java | 14 +-
.../apache/lucene/analysis/gl/GalicianStemmer.java | 47 +-
.../apache/lucene/analysis/gl/package-info.java | 4 +-
.../apache/lucene/analysis/hi/HindiAnalyzer.java | 65 +-
.../analysis/hi/HindiNormalizationFilter.java | 27 +-
.../hi/HindiNormalizationFilterFactory.java | 16 +-
.../apache/lucene/analysis/hi/HindiNormalizer.java | 271 +-
.../apache/lucene/analysis/hi/HindiStemFilter.java | 12 +-
.../lucene/analysis/hi/HindiStemFilterFactory.java | 16 +-
.../apache/lucene/analysis/hi/HindiStemmer.java | 166 +-
.../apache/lucene/analysis/hi/package-info.java | 4 +-
.../lucene/analysis/hu/HungarianAnalyzer.java | 57 +-
.../analysis/hu/HungarianLightStemFilter.java | 16 +-
.../hu/HungarianLightStemFilterFactory.java | 16 +-
.../lucene/analysis/hu/HungarianLightStemmer.java | 252 +-
.../apache/lucene/analysis/hu/package-info.java | 4 +-
.../lucene/analysis/hunspell/Dictionary.java | 518 +--
.../analysis/hunspell/HunspellStemFilter.java | 96 +-
.../hunspell/HunspellStemFilterFactory.java | 45 +-
.../analysis/hunspell/ISO8859_14Decoder.java | 32 +-
.../apache/lucene/analysis/hunspell/Stemmer.java | 329 +-
.../lucene/analysis/hunspell/package-info.java | 11 +-
.../lucene/analysis/hy/ArmenianAnalyzer.java | 55 +-
.../apache/lucene/analysis/hy/package-info.java | 4 +-
.../lucene/analysis/id/IndonesianAnalyzer.java | 62 +-
.../lucene/analysis/id/IndonesianStemFilter.java | 23 +-
.../analysis/id/IndonesianStemFilterFactory.java | 12 +-
.../lucene/analysis/id/IndonesianStemmer.java | 130 +-
.../apache/lucene/analysis/id/package-info.java | 4 +-
.../analysis/in/IndicNormalizationFilter.java | 9 +-
.../in/IndicNormalizationFilterFactory.java | 16 +-
.../apache/lucene/analysis/in/IndicNormalizer.java | 407 ++-
.../apache/lucene/analysis/in/package-info.java | 4 +-
.../apache/lucene/analysis/it/ItalianAnalyzer.java | 71 +-
.../lucene/analysis/it/ItalianLightStemFilter.java | 16 +-
.../analysis/it/ItalianLightStemFilterFactory.java | 18 +-
.../lucene/analysis/it/ItalianLightStemmer.java | 103 +-
.../apache/lucene/analysis/it/package-info.java | 4 +-
.../lucene/analysis/lt/LithuanianAnalyzer.java | 55 +-
.../apache/lucene/analysis/lt/package-info.java | 4 +-
.../apache/lucene/analysis/lv/LatvianAnalyzer.java | 57 +-
.../lucene/analysis/lv/LatvianStemFilter.java | 16 +-
.../analysis/lv/LatvianStemFilterFactory.java | 16 +-
.../apache/lucene/analysis/lv/LatvianStemmer.java | 155 +-
.../apache/lucene/analysis/lv/package-info.java | 4 +-
.../lucene/analysis/minhash/MinHashFilter.java | 73 +-
.../analysis/minhash/MinHashFilterFactory.java | 17 +-
.../lucene/analysis/minhash/package-info.java | 4 +-
.../analysis/miscellaneous/ASCIIFoldingFilter.java | 134 +-
.../miscellaneous/ASCIIFoldingFilterFactory.java | 15 +-
.../miscellaneous/CapitalizationFilter.java | 73 +-
.../miscellaneous/CapitalizationFilterFactory.java | 52 +-
.../miscellaneous/CodepointCountFilter.java | 21 +-
.../miscellaneous/CodepointCountFilterFactory.java | 10 +-
.../miscellaneous/ConcatenateGraphFilter.java | 150 +-
.../ConcatenateGraphFilterFactory.java | 59 +-
.../miscellaneous/ConcatenatingTokenStream.java | 30 +-
.../miscellaneous/ConditionalTokenFilter.java | 40 +-
.../ConditionalTokenFilterFactory.java | 41 +-
.../miscellaneous/DateRecognizerFilter.java | 11 +-
.../miscellaneous/DateRecognizerFilterFactory.java | 21 +-
.../DelimitedTermFrequencyTokenFilter.java | 23 +-
.../DelimitedTermFrequencyTokenFilterFactory.java | 8 +-
.../miscellaneous/DropIfFlaggedFilter.java | 1 -
.../miscellaneous/DropIfFlaggedFilterFactory.java | 21 +-
.../analysis/miscellaneous/EmptyTokenStream.java | 6 +-
.../analysis/miscellaneous/FingerprintFilter.java | 93 +-
.../miscellaneous/FingerprintFilterFactory.java | 19 +-
.../miscellaneous/FixBrokenOffsetsFilter.java | 3 +-
.../FixBrokenOffsetsFilterFactory.java | 6 +-
.../miscellaneous/HyphenatedWordsFilter.java | 34 +-
.../HyphenatedWordsFilterFactory.java | 13 +-
.../analysis/miscellaneous/KeepWordFilter.java | 16 +-
.../miscellaneous/KeepWordFilterFactory.java | 11 +-
.../miscellaneous/KeywordMarkerFilter.java | 10 +-
.../miscellaneous/KeywordMarkerFilterFactory.java | 21 +-
.../miscellaneous/KeywordRepeatFilter.java | 20 +-
.../miscellaneous/KeywordRepeatFilterFactory.java | 17 +-
.../analysis/miscellaneous/LengthFilter.java | 21 +-
.../miscellaneous/LengthFilterFactory.java | 11 +-
.../miscellaneous/LimitTokenCountAnalyzer.java | 33 +-
.../miscellaneous/LimitTokenCountFilter.java | 38 +-
.../LimitTokenCountFilterFactory.java | 15 +-
.../miscellaneous/LimitTokenOffsetFilter.java | 23 +-
.../LimitTokenOffsetFilterFactory.java | 10 +-
.../miscellaneous/LimitTokenPositionFilter.java | 43 +-
.../LimitTokenPositionFilterFactory.java | 16 +-
.../miscellaneous/PatternKeywordMarkerFilter.java | 27 +-
.../miscellaneous/PerFieldAnalyzerWrapper.java | 57 +-
.../miscellaneous/ProtectedTermFilter.java | 18 +-
.../miscellaneous/ProtectedTermFilterFactory.java | 72 +-
.../miscellaneous/RemoveDuplicatesTokenFilter.java | 23 +-
.../RemoveDuplicatesTokenFilterFactory.java | 11 +-
.../miscellaneous/ScandinavianFoldingFilter.java | 81 +-
.../ScandinavianFoldingFilterFactory.java | 8 +-
.../ScandinavianNormalizationFilter.java | 55 +-
.../ScandinavianNormalizationFilterFactory.java | 6 +-
.../miscellaneous/SetKeywordMarkerFilter.java | 23 +-
.../miscellaneous/StemmerOverrideFilter.java | 95 +-
.../StemmerOverrideFilterFactory.java | 11 +-
.../lucene/analysis/miscellaneous/TrimFilter.java | 23 +-
.../analysis/miscellaneous/TrimFilterFactory.java | 12 +-
.../miscellaneous/TruncateTokenFilter.java | 16 +-
.../miscellaneous/TruncateTokenFilterFactory.java | 14 +-
.../miscellaneous/TypeAsSynonymFilter.java | 28 +-
.../miscellaneous/TypeAsSynonymFilterFactory.java | 24 +-
.../miscellaneous/WordDelimiterFilter.java | 267 +-
.../miscellaneous/WordDelimiterFilterFactory.java | 134 +-
.../miscellaneous/WordDelimiterGraphFilter.java | 325 +-
.../WordDelimiterGraphFilterFactory.java | 135 +-
.../miscellaneous/WordDelimiterIterator.java | 184 +-
.../analysis/miscellaneous/package-info.java | 6 +-
.../analysis/ngram/EdgeNGramFilterFactory.java | 8 +-
.../analysis/ngram/EdgeNGramTokenFilter.java | 42 +-
.../lucene/analysis/ngram/EdgeNGramTokenizer.java | 12 +-
.../analysis/ngram/EdgeNGramTokenizerFactory.java | 7 +-
.../lucene/analysis/ngram/NGramFilterFactory.java | 8 +-
.../lucene/analysis/ngram/NGramTokenFilter.java | 71 +-
.../lucene/analysis/ngram/NGramTokenizer.java | 64 +-
.../analysis/ngram/NGramTokenizerFactory.java | 14 +-
.../apache/lucene/analysis/ngram/package-info.java | 4 +-
.../apache/lucene/analysis/nl/DutchAnalyzer.java | 92 +-
.../apache/lucene/analysis/nl/package-info.java | 4 +-
.../lucene/analysis/no/NorwegianAnalyzer.java | 58 +-
.../analysis/no/NorwegianLightStemFilter.java | 33 +-
.../no/NorwegianLightStemFilterFactory.java | 20 +-
.../lucene/analysis/no/NorwegianLightStemmer.java | 224 +-
.../analysis/no/NorwegianMinimalStemFilter.java | 31 +-
.../no/NorwegianMinimalStemFilterFactory.java | 20 +-
.../analysis/no/NorwegianMinimalStemmer.java | 95 +-
.../apache/lucene/analysis/no/package-info.java | 4 +-
.../analysis/path/PathHierarchyTokenizer.java | 76 +-
.../path/PathHierarchyTokenizerFactory.java | 48 +-
.../path/ReversePathHierarchyTokenizer.java | 45 +-
.../apache/lucene/analysis/path/package-info.java | 4 +-
.../pattern/PatternCaptureGroupFilterFactory.java | 18 +-
.../pattern/PatternCaptureGroupTokenFilter.java | 86 +-
.../analysis/pattern/PatternReplaceCharFilter.java | 49 +-
.../pattern/PatternReplaceCharFilterFactory.java | 9 +-
.../analysis/pattern/PatternReplaceFilter.java | 36 +-
.../pattern/PatternReplaceFilterFactory.java | 12 +-
.../lucene/analysis/pattern/PatternTokenizer.java | 61 +-
.../analysis/pattern/PatternTokenizerFactory.java | 52 +-
.../pattern/SimplePatternSplitTokenizer.java | 43 +-
.../SimplePatternSplitTokenizerFactory.java | 45 +-
.../analysis/pattern/SimplePatternTokenizer.java | 55 +-
.../pattern/SimplePatternTokenizerFactory.java | 40 +-
.../lucene/analysis/pattern/package-info.java | 4 +-
.../lucene/analysis/payloads/AbstractEncoder.java | 8 +-
.../payloads/DelimitedPayloadTokenFilter.java | 24 +-
.../DelimitedPayloadTokenFilterFactory.java | 18 +-
.../lucene/analysis/payloads/FloatEncoder.java | 8 +-
.../lucene/analysis/payloads/IdentityEncoder.java | 14 +-
.../lucene/analysis/payloads/IntegerEncoder.java | 13 +-
.../payloads/NumericPayloadTokenFilter.java | 13 +-
.../payloads/NumericPayloadTokenFilterFactory.java | 13 +-
.../lucene/analysis/payloads/PayloadEncoder.java | 13 +-
.../lucene/analysis/payloads/PayloadHelper.java | 36 +-
.../payloads/TokenOffsetPayloadTokenFilter.java | 15 +-
.../TokenOffsetPayloadTokenFilterFactory.java | 14 +-
.../payloads/TypeAsPayloadTokenFilter.java | 11 +-
.../payloads/TypeAsPayloadTokenFilterFactory.java | 14 +-
.../lucene/analysis/payloads/package-info.java | 4 +-
.../lucene/analysis/pt/PortugueseAnalyzer.java | 58 +-
.../analysis/pt/PortugueseLightStemFilter.java | 16 +-
.../pt/PortugueseLightStemFilterFactory.java | 14 +-
.../lucene/analysis/pt/PortugueseLightStemmer.java | 162 +-
.../analysis/pt/PortugueseMinimalStemFilter.java | 16 +-
.../pt/PortugueseMinimalStemFilterFactory.java | 14 +-
.../analysis/pt/PortugueseMinimalStemmer.java | 22 +-
.../lucene/analysis/pt/PortugueseStemFilter.java | 18 +-
.../analysis/pt/PortugueseStemFilterFactory.java | 16 +-
.../lucene/analysis/pt/PortugueseStemmer.java | 65 +-
.../apache/lucene/analysis/pt/RSLPStemmerBase.java | 262 +-
.../apache/lucene/analysis/pt/package-info.java | 4 +-
.../analysis/query/QueryAutoStopWordAnalyzer.java | 90 +-
.../apache/lucene/analysis/query/package-info.java | 4 +-
.../analysis/reverse/ReverseStringFilter.java | 106 +-
.../reverse/ReverseStringFilterFactory.java | 12 +-
.../lucene/analysis/reverse/package-info.java | 4 +-
.../lucene/analysis/ro/RomanianAnalyzer.java | 61 +-
.../apache/lucene/analysis/ro/package-info.java | 4 +-
.../apache/lucene/analysis/ru/RussianAnalyzer.java | 159 +-
.../lucene/analysis/ru/RussianLightStemFilter.java | 16 +-
.../analysis/ru/RussianLightStemFilterFactory.java | 14 +-
.../lucene/analysis/ru/RussianLightStemmer.java | 168 +-
.../apache/lucene/analysis/ru/package-info.java | 4 +-
.../analysis/shingle/FixedShingleFilter.java | 45 +-
.../shingle/FixedShingleFilterFactory.java | 6 +-
.../analysis/shingle/ShingleAnalyzerWrapper.java | 52 +-
.../lucene/analysis/shingle/ShingleFilter.java | 295 +-
.../analysis/shingle/ShingleFilterFactory.java | 24 +-
.../lucene/analysis/shingle/package-info.java | 4 +-
.../lucene/analysis/sinks/TeeSinkTokenFilter.java | 44 +-
.../apache/lucene/analysis/sinks/package-info.java | 4 +-
.../lucene/analysis/snowball/SnowballFilter.java | 42 +-
.../snowball/SnowballPorterFilterFactory.java | 20 +-
.../lucene/analysis/snowball/package-info.java | 53 +-
.../apache/lucene/analysis/sr/SerbianAnalyzer.java | 49 +-
.../analysis/sr/SerbianNormalizationFilter.java | 243 +-
.../sr/SerbianNormalizationFilterFactory.java | 16 +-
.../sr/SerbianNormalizationRegularFilter.java | 228 +-
.../apache/lucene/analysis/sr/package-info.java | 4 +-
.../apache/lucene/analysis/sv/SwedishAnalyzer.java | 57 +-
.../lucene/analysis/sv/SwedishLightStemFilter.java | 16 +-
.../analysis/sv/SwedishLightStemFilterFactory.java | 14 +-
.../lucene/analysis/sv/SwedishLightStemmer.java | 112 +-
.../apache/lucene/analysis/sv/package-info.java | 4 +-
.../lucene/analysis/synonym/SolrSynonymParser.java | 79 +-
.../lucene/analysis/synonym/SynonymFilter.java | 193 +-
.../analysis/synonym/SynonymFilterFactory.java | 114 +-
.../analysis/synonym/SynonymGraphFilter.java | 185 +-
.../synonym/SynonymGraphFilterFactory.java | 110 +-
.../apache/lucene/analysis/synonym/SynonymMap.java | 128 +-
.../analysis/synonym/WordnetSynonymParser.java | 28 +-
.../lucene/analysis/synonym/package-info.java | 4 +-
.../apache/lucene/analysis/th/ThaiAnalyzer.java | 44 +-
.../apache/lucene/analysis/th/ThaiTokenizer.java | 45 +-
.../lucene/analysis/th/ThaiTokenizerFactory.java | 12 +-
.../apache/lucene/analysis/th/package-info.java | 4 +-
.../lucene/analysis/tr/ApostropheFilter.java | 20 +-
.../analysis/tr/ApostropheFilterFactory.java | 8 +-
.../apache/lucene/analysis/tr/TurkishAnalyzer.java | 54 +-
.../lucene/analysis/tr/TurkishLowerCaseFilter.java | 74 +-
.../analysis/tr/TurkishLowerCaseFilterFactory.java | 14 +-
.../apache/lucene/analysis/tr/package-info.java | 4 +-
.../lucene/analysis/util/CharArrayIterator.java | 45 +-
.../apache/lucene/analysis/util/CharTokenizer.java | 127 +-
.../apache/lucene/analysis/util/ElisionFilter.java | 15 +-
.../lucene/analysis/util/ElisionFilterFactory.java | 12 +-
.../analysis/util/FilesystemResourceLoader.java | 55 +-
.../lucene/analysis/util/OpenStringBuilder.java | 55 +-
.../lucene/analysis/util/RollingCharBuffer.java | 46 +-
.../analysis/util/SegmentingTokenizerBase.java | 97 +-
.../apache/lucene/analysis/util/StemmerUtil.java | 44 +-
.../apache/lucene/analysis/util/UnicodeProps.java | 24 +-
.../apache/lucene/analysis/util/package-info.java | 4 +-
.../analysis/wikipedia/WikipediaTokenizer.java | 218 +-
.../wikipedia/WikipediaTokenizerFactory.java | 6 +-
.../analysis/wikipedia/WikipediaTokenizerImpl.java | 1444 ++++----
.../lucene/analysis/wikipedia/package-info.java | 4 +-
.../lucene/analysis/bg/TestBulgarianAnalyzer.java | 37 +-
.../bg/TestBulgarianStemFilterFactory.java | 25 +-
.../lucene/analysis/bg/TestBulgarianStemmer.java | 103 +-
.../lucene/analysis/bn/TestBengaliAnalyzer.java | 12 +-
.../lucene/analysis/bn/TestBengaliFilters.java | 59 +-
.../lucene/analysis/bn/TestBengaliNormalizer.java | 40 +-
.../lucene/analysis/bn/TestBengaliStemmer.java | 31 +-
.../boost/DelimitedBoostTokenFilterTest.java | 22 +-
.../lucene/analysis/br/TestBrazilianAnalyzer.java | 211 +-
.../br/TestBrazilianStemFilterFactory.java | 25 +-
.../lucene/analysis/ca/TestCatalanAnalyzer.java | 21 +-
.../charfilter/HTMLStripCharFilterTest.java | 621 ++--
.../charfilter/TestHTMLStripCharFilterFactory.java | 83 +-
.../analysis/charfilter/TestMappingCharFilter.java | 286 +-
.../charfilter/TestMappingCharFilterFactory.java | 52 +-
.../lucene/analysis/cjk/TestCJKAnalyzer.java | 476 +--
.../lucene/analysis/cjk/TestCJKBigramFilter.java | 287 +-
.../analysis/cjk/TestCJKBigramFilterFactory.java | 51 +-
.../analysis/cjk/TestCJKWidthCharFilter.java | 78 +-
.../cjk/TestCJKWidthCharFilterFactory.java | 20 +-
.../lucene/analysis/cjk/TestCJKWidthFilter.java | 73 +-
.../analysis/cjk/TestCJKWidthFilterFactory.java | 21 +-
.../lucene/analysis/ckb/TestSoraniAnalyzer.java | 32 +-
.../ckb/TestSoraniNormalizationFilter.java | 58 +-
.../ckb/TestSoraniNormalizationFilterFactory.java | 23 +-
.../lucene/analysis/ckb/TestSoraniStemFilter.java | 68 +-
.../analysis/ckb/TestSoraniStemFilterFactory.java | 25 +-
.../analysis/classic/TestClassicAnalyzer.java | 201 +-
.../analysis/classic/TestClassicFactories.java | 54 +-
.../commongrams/CommonGramsFilterTest.java | 390 +--
.../commongrams/TestCommonGramsFilterFactory.java | 81 +-
.../TestCommonGramsQueryFilterFactory.java | 90 +-
.../compound/TestCompoundWordTokenFilter.java | 557 ++--
...stDictionaryCompoundWordTokenFilterFactory.java | 41 +-
...tHyphenationCompoundWordTokenFilterFactory.java | 80 +-
.../core/TestAllAnalyzersHaveFactories.java | 148 +-
.../apache/lucene/analysis/core/TestAnalyzers.java | 162 +-
.../lucene/analysis/core/TestBugInSomething.java | 323 +-
.../lucene/analysis/core/TestCoreFactories.java | 55 +-
.../analysis/core/TestDecimalDigitFilter.java | 121 +-
.../core/TestDecimalDigitFilterFactory.java | 29 +-
.../lucene/analysis/core/TestDuelingAnalyzers.java | 181 +-
.../apache/lucene/analysis/core/TestFactories.java | 73 +-
.../analysis/core/TestFlattenGraphFilter.java | 372 ++-
.../lucene/analysis/core/TestKeywordAnalyzer.java | 26 +-
.../lucene/analysis/core/TestKeywordTokenizer.java | 37 +-
.../lucene/analysis/core/TestRandomChains.java | 776 +++--
.../lucene/analysis/core/TestStopAnalyzer.java | 27 +-
.../analysis/core/TestStopFilterFactory.java | 61 +-
.../lucene/analysis/core/TestTypeTokenFilter.java | 28 +-
.../analysis/core/TestTypeTokenFilterFactory.java | 52 +-
.../core/TestUnicodeWhitespaceTokenizer.java | 54 +-
.../analysis/core/TestWhitespaceAnalyzer.java | 22 +-
.../lucene/analysis/custom/TestCustomAnalyzer.java | 552 ++--
.../lucene/analysis/cz/TestCzechAnalyzer.java | 27 +-
.../analysis/cz/TestCzechStemFilterFactory.java | 27 +-
.../lucene/analysis/cz/TestCzechStemmer.java | 410 ++-
.../lucene/analysis/da/TestDanishAnalyzer.java | 16 +-
.../lucene/analysis/de/TestGermanAnalyzer.java | 25 +-
.../analysis/de/TestGermanLightStemFilter.java | 69 +-
.../de/TestGermanLightStemFilterFactory.java | 21 +-
.../analysis/de/TestGermanMinimalStemFilter.java | 71 +-
.../de/TestGermanMinimalStemFilterFactory.java | 23 +-
.../analysis/de/TestGermanNormalizationFilter.java | 62 +-
.../de/TestGermanNormalizationFilterFactory.java | 23 +-
.../lucene/analysis/de/TestGermanStemFilter.java | 72 +-
.../analysis/de/TestGermanStemFilterFactory.java | 27 +-
.../lucene/analysis/el/GreekAnalyzerTest.java | 45 +-
.../el/TestGreekLowerCaseFilterFactory.java | 25 +-
.../analysis/el/TestGreekStemFilterFactory.java | 21 +-
.../lucene/analysis/el/TestGreekStemmer.java | 186 +-
.../analysis/email/TestUAX29URLEmailAnalyzer.java | 657 ++--
.../analysis/email/TestUAX29URLEmailTokenizer.java | 2426 +++++++++++---
.../email/TestUAX29URLEmailTokenizerFactory.java | 201 +-
.../lucene/analysis/en/TestEnglishAnalyzer.java | 16 +-
.../analysis/en/TestEnglishMinimalStemFilter.java | 46 +-
.../en/TestEnglishMinimalStemFilterFactory.java | 21 +-
.../lucene/analysis/en/TestKStemFilterFactory.java | 21 +-
.../apache/lucene/analysis/en/TestKStemmer.java | 113 +-
.../lucene/analysis/en/TestPorterStemFilter.java | 58 +-
.../analysis/en/TestPorterStemFilterFactory.java | 25 +-
.../lucene/analysis/es/TestSpanishAnalyzer.java | 16 +-
.../analysis/es/TestSpanishLightStemFilter.java | 48 +-
.../es/TestSpanishLightStemFilterFactory.java | 23 +-
.../analysis/es/TestSpanishMinimalStemFilter.java | 38 +-
.../es/TestSpanishMinimalStemFilterFactory.java | 20 +-
.../lucene/analysis/et/TestEstonianAnalyzer.java | 69 +-
.../lucene/analysis/eu/TestBasqueAnalyzer.java | 18 +-
.../lucene/analysis/fa/TestPersianAnalyzer.java | 200 +-
.../lucene/analysis/fa/TestPersianCharFilter.java | 34 +-
.../fa/TestPersianNormalizationFilter.java | 30 +-
.../fa/TestPersianNormalizationFilterFactory.java | 25 +-
.../lucene/analysis/fi/TestFinnishAnalyzer.java | 16 +-
.../analysis/fi/TestFinnishLightStemFilter.java | 69 +-
.../fi/TestFinnishLightStemFilterFactory.java | 23 +-
.../lucene/analysis/fr/TestFrenchAnalyzer.java | 142 +-
.../analysis/fr/TestFrenchLightStemFilter.java | 147 +-
.../fr/TestFrenchLightStemFilterFactory.java | 21 +-
.../analysis/fr/TestFrenchMinimalStemFilter.java | 73 +-
.../fr/TestFrenchMinimalStemFilterFactory.java | 21 +-
.../lucene/analysis/ga/TestIrishAnalyzer.java | 29 +-
.../analysis/ga/TestIrishLowerCaseFilter.java | 35 +-
.../ga/TestIrishLowerCaseFilterFactory.java | 21 +-
.../lucene/analysis/gl/TestGalicianAnalyzer.java | 16 +-
.../analysis/gl/TestGalicianMinimalStemFilter.java | 69 +-
.../gl/TestGalicianMinimalStemFilterFactory.java | 21 +-
.../lucene/analysis/gl/TestGalicianStemFilter.java | 44 +-
.../analysis/gl/TestGalicianStemFilterFactory.java | 21 +-
.../lucene/analysis/hi/TestHindiAnalyzer.java | 25 +-
.../lucene/analysis/hi/TestHindiFilters.java | 59 +-
.../lucene/analysis/hi/TestHindiNormalizer.java | 31 +-
.../lucene/analysis/hi/TestHindiStemmer.java | 58 +-
.../lucene/analysis/hu/TestHungarianAnalyzer.java | 16 +-
.../analysis/hu/TestHungarianLightStemFilter.java | 67 +-
.../hu/TestHungarianLightStemFilterFactory.java | 21 +-
.../lucene/analysis/hunspell/StemmerTestBase.java | 34 +-
.../lucene/analysis/hunspell/Test64kAffixes.java | 18 +-
.../analysis/hunspell/TestAllDictionaries.java | 308 +-
.../analysis/hunspell/TestAllDictionaries2.java | 424 ++-
.../analysis/hunspell/TestAlternateCasing.java | 35 +-
.../analysis/hunspell/TestCaseInsensitive.java | 5 +-
.../analysis/hunspell/TestCaseSensitive.java | 41 +-
.../lucene/analysis/hunspell/TestCircumfix.java | 5 +-
.../analysis/hunspell/TestComplexPrefix.java | 3 +-
.../lucene/analysis/hunspell/TestCondition.java | 3 +-
.../lucene/analysis/hunspell/TestCondition2.java | 5 +-
.../apache/lucene/analysis/hunspell/TestConv.java | 5 +-
.../lucene/analysis/hunspell/TestDependencies.java | 5 +-
.../lucene/analysis/hunspell/TestDictionary.java | 148 +-
.../lucene/analysis/hunspell/TestDoubleEscape.java | 3 +-
.../lucene/analysis/hunspell/TestEscaped.java | 3 +-
.../lucene/analysis/hunspell/TestFlagLong.java | 3 +-
.../lucene/analysis/hunspell/TestFlagNum.java | 3 +-
.../lucene/analysis/hunspell/TestFullStrip.java | 3 +-
.../lucene/analysis/hunspell/TestHomonyms.java | 5 +-
.../analysis/hunspell/TestHunspellStemFilter.java | 82 +-
.../hunspell/TestHunspellStemFilterFactory.java | 30 +-
.../lucene/analysis/hunspell/TestIgnore.java | 5 +-
.../lucene/analysis/hunspell/TestKeepCase.java | 23 +-
.../apache/lucene/analysis/hunspell/TestMorph.java | 5 +-
.../lucene/analysis/hunspell/TestMorphAlias.java | 9 +-
.../lucene/analysis/hunspell/TestMorphData.java | 9 +-
.../lucene/analysis/hunspell/TestNeedAffix.java | 15 +-
.../analysis/hunspell/TestOnlyInCompound.java | 9 +-
.../analysis/hunspell/TestOptionalCondition.java | 3 +-
.../lucene/analysis/hunspell/TestSpaces.java | 5 +-
.../lucene/analysis/hunspell/TestStemmer.java | 5 +-
.../hunspell/TestStrangeOvergeneration.java | 3 +-
.../lucene/analysis/hunspell/TestTwoFold.java | 5 +-
.../lucene/analysis/hunspell/TestTwoSuffixes.java | 5 +-
.../lucene/analysis/hunspell/TestZeroAffix.java | 3 +-
.../lucene/analysis/hunspell/TestZeroAffix2.java | 5 +-
.../lucene/analysis/hy/TestArmenianAnalyzer.java | 18 +-
.../lucene/analysis/id/TestIndonesianAnalyzer.java | 16 +-
.../id/TestIndonesianStemFilterFactory.java | 37 +-
.../lucene/analysis/id/TestIndonesianStemmer.java | 75 +-
.../lucene/analysis/in/TestIndicNormalizer.java | 34 +-
.../lucene/analysis/it/TestItalianAnalyzer.java | 22 +-
.../analysis/it/TestItalianLightStemFilter.java | 48 +-
.../it/TestItalianLightStemFilterFactory.java | 21 +-
.../lucene/analysis/lt/TestLithuanianAnalyzer.java | 18 +-
.../lucene/analysis/lt/TestLithuanianStemming.java | 773 +++--
.../lucene/analysis/lv/TestLatvianAnalyzer.java | 16 +-
.../analysis/lv/TestLatvianStemFilterFactory.java | 23 +-
.../lucene/analysis/lv/TestLatvianStemmer.java | 330 +-
.../lucene/analysis/minhash/MinHashFilterTest.java | 133 +-
.../DateRecognizerFilterFactoryTest.java | 20 +-
.../miscellaneous/DateRecognizerFilterTest.java | 9 +-
.../DelimitedTermFrequencyTokenFilterTest.java | 11 +-
.../miscellaneous/TestASCIIFoldingFilter.java | 3475 ++++++++++----------
.../TestAsciiFoldingFilterFactory.java | 18 +-
.../miscellaneous/TestCapitalizationFilter.java | 366 ++-
.../TestCapitalizationFilterFactory.java | 425 ++-
.../miscellaneous/TestCodepointCountFilter.java | 37 +-
.../TestCodepointCountFilterFactory.java | 55 +-
.../miscellaneous/TestConcatenateGraphFilter.java | 72 +-
.../TestConcatenateGraphFilterFactory.java | 64 +-
.../TestConcatenatingTokenStream.java | 73 +-
.../miscellaneous/TestConditionalTokenFilter.java | 276 +-
.../miscellaneous/TestDropIfFlaggedFilter.java | 39 +-
.../TestDropIfFlaggedFilterFactory.java | 18 +-
.../miscellaneous/TestEmptyTokenStream.java | 11 +-
.../miscellaneous/TestFingerprintFilter.java | 26 +-
.../TestFingerprintFilterFactory.java | 42 +-
.../miscellaneous/TestFixBrokenOffsetsFilter.java | 7 +-
.../miscellaneous/TestHyphenatedWordsFilter.java | 71 +-
.../miscellaneous/TestKeepFilterFactory.java | 26 +-
.../analysis/miscellaneous/TestKeepWordFilter.java | 49 +-
.../miscellaneous/TestKeywordMarkerFilter.java | 122 +-
.../TestKeywordMarkerFilterFactory.java | 108 +-
.../miscellaneous/TestKeywordRepeatFilter.java | 33 +-
.../analysis/miscellaneous/TestLengthFilter.java | 31 +-
.../miscellaneous/TestLengthFilterFactory.java | 58 +-
.../miscellaneous/TestLimitTokenCountAnalyzer.java | 55 +-
.../miscellaneous/TestLimitTokenCountFilter.java | 5 +-
.../TestLimitTokenCountFilterFactory.java | 56 +-
.../miscellaneous/TestLimitTokenOffsetFilter.java | 7 +-
.../TestLimitTokenOffsetFilterFactory.java | 54 +-
.../TestLimitTokenPositionFilter.java | 91 +-
.../TestLimitTokenPositionFilterFactory.java | 75 +-
.../miscellaneous/TestMiscellaneousFactories.java | 29 +-
.../miscellaneous/TestPerFieldAnalyzerWrapper.java | 108 +-
.../miscellaneous/TestProtectedTermFilter.java | 17 +-
.../TestProtectedTermFilterFactory.java | 207 +-
.../TestRemoveDuplicatesTokenFilter.java | 191 +-
.../TestRemoveDuplicatesTokenFilterFactory.java | 36 +-
.../TestScandinavianFoldingFilter.java | 45 +-
.../TestScandinavianFoldingFilterFactory.java | 19 +-
.../TestScandinavianNormalizationFilter.java | 46 +-
...TestScandinavianNormalizationFilterFactory.java | 16 +-
.../miscellaneous/TestStemmerOverrideFilter.java | 62 +-
.../TestStemmerOverrideFilterFactory.java | 52 +-
.../analysis/miscellaneous/TestTrimFilter.java | 52 +-
.../miscellaneous/TestTrimFilterFactory.java | 21 +-
.../miscellaneous/TestTruncateTokenFilter.java | 8 +-
.../TestTruncateTokenFilterFactory.java | 68 +-
.../miscellaneous/TestTypeAsSynonymFilter.java | 87 +-
.../TestTypeAsSynonymFilterFactory.java | 37 +-
.../miscellaneous/TestWordDelimiterFilter.java | 651 ++--
.../TestWordDelimiterGraphFilter.java | 1062 +++---
.../analysis/ngram/EdgeNGramTokenFilterTest.java | 160 +-
.../analysis/ngram/EdgeNGramTokenizerTest.java | 116 +-
.../analysis/ngram/NGramTokenFilterTest.java | 224 +-
.../lucene/analysis/ngram/NGramTokenizerTest.java | 131 +-
.../lucene/analysis/ngram/TestNGramFilters.java | 180 +-
.../lucene/analysis/nl/TestDutchAnalyzer.java | 210 +-
.../lucene/analysis/no/TestNorwegianAnalyzer.java | 16 +-
.../analysis/no/TestNorwegianLightStemFilter.java | 90 +-
.../no/TestNorwegianLightStemFilterFactory.java | 23 +-
.../no/TestNorwegianMinimalStemFilter.java | 92 +-
.../no/TestNorwegianMinimalStemFilterFactory.java | 35 +-
.../analysis/path/TestPathHierarchyTokenizer.java | 264 +-
.../path/TestReversePathHierarchyTokenizer.java | 219 +-
.../TestPatternCaptureGroupTokenFilter.java | 554 ++--
.../pattern/TestPatternReplaceCharFilter.java | 242 +-
.../TestPatternReplaceCharFilterFactory.java | 84 +-
.../analysis/pattern/TestPatternReplaceFilter.java | 115 +-
.../pattern/TestPatternReplaceFilterFactory.java | 34 +-
.../analysis/pattern/TestPatternTokenizer.java | 125 +-
.../pattern/TestPatternTokenizerFactory.java | 23 +-
.../pattern/TestSimplePatternSplitTokenizer.java | 135 +-
.../pattern/TestSimplePatternTokenizer.java | 97 +-
.../payloads/DelimitedPayloadTokenFilterTest.java | 52 +-
.../payloads/NumericPayloadTokenFilterTest.java | 33 +-
.../TestDelimitedPayloadTokenFilterFactory.java | 27 +-
.../TokenOffsetPayloadTokenFilterTest.java | 16 +-
.../payloads/TypeAsPayloadTokenFilterTest.java | 20 +-
.../lucene/analysis/pt/TestPortugueseAnalyzer.java | 16 +-
.../analysis/pt/TestPortugueseLightStemFilter.java | 166 +-
.../pt/TestPortugueseLightStemFilterFactory.java | 23 +-
.../pt/TestPortugueseMinimalStemFilter.java | 120 +-
.../pt/TestPortugueseMinimalStemFilterFactory.java | 23 +-
.../analysis/pt/TestPortugueseStemFilter.java | 120 +-
.../pt/TestPortugueseStemFilterFactory.java | 21 +-
.../query/QueryAutoStopWordAnalyzerTest.java | 65 +-
.../analysis/reverse/TestReverseStringFilter.java | 67 +-
.../reverse/TestReverseStringFilterFactory.java | 25 +-
.../lucene/analysis/ro/TestRomanianAnalyzer.java | 16 +-
.../lucene/analysis/ru/TestRussianAnalyzer.java | 41 +-
.../analysis/ru/TestRussianLightStemFilter.java | 69 +-
.../ru/TestRussianLightStemFilterFactory.java | 21 +-
.../analysis/shingle/FixedShingleFilterTest.java | 362 +-
.../shingle/ShingleAnalyzerWrapperTest.java | 551 ++--
.../lucene/analysis/shingle/ShingleFilterTest.java | 2240 +++++++------
.../analysis/shingle/TestShingleFilterFactory.java | 238 +-
.../analysis/sinks/TestTeeSinkTokenFilter.java | 74 +-
.../lucene/analysis/snowball/TestSnowball.java | 86 +-
.../snowball/TestSnowballPorterFilterFactory.java | 43 +-
.../analysis/snowball/TestSnowballVocab.java | 37 +-
.../lucene/analysis/sr/TestSerbianAnalyzer.java | 16 +-
.../sr/TestSerbianNormalizationFilter.java | 57 +-
.../sr/TestSerbianNormalizationFilterFactory.java | 28 +-
.../sr/TestSerbianNormalizationRegularFilter.java | 58 +-
.../lucene/analysis/sv/TestSwedishAnalyzer.java | 16 +-
.../analysis/sv/TestSwedishLightStemFilter.java | 69 +-
.../sv/TestSwedishLightStemFilterFactory.java | 21 +-
.../synonym/BaseSynonymParserTestCase.java | 61 +-
.../analysis/synonym/TestMultiWordSynonyms.java | 27 +-
.../analysis/synonym/TestSolrSynonymParser.java | 232 +-
.../analysis/synonym/TestSynonymFilterFactory.java | 148 +-
.../analysis/synonym/TestSynonymGraphFilter.java | 1330 ++++----
.../analysis/synonym/TestSynonymMapFilter.java | 628 ++--
.../analysis/synonym/TestWordnetSynonymParser.java | 70 +-
.../lucene/analysis/th/TestThaiAnalyzer.java | 110 +-
.../analysis/th/TestThaiTokenizerFactory.java | 33 +-
.../lucene/analysis/tr/TestApostropheFilter.java | 4 +-
.../analysis/tr/TestApostropheFilterFactory.java | 32 +-
.../lucene/analysis/tr/TestTurkishAnalyzer.java | 13 +-
.../analysis/tr/TestTurkishLowerCaseFilter.java | 72 +-
.../tr/TestTurkishLowerCaseFilterFactory.java | 27 +-
.../analysis/util/StringMockResourceLoader.java | 6 +-
.../analysis/util/TestCharArrayIterator.java | 46 +-
.../lucene/analysis/util/TestCharTokenizers.java | 171 +-
.../apache/lucene/analysis/util/TestElision.java | 25 +-
.../analysis/util/TestElisionFilterFactory.java | 51 +-
.../util/TestFilesystemResourceLoader.java | 61 +-
.../analysis/util/TestRollingCharBuffer.java | 17 +-
.../analysis/util/TestSegmentingTokenizerBase.java | 177 +-
.../wikipedia/TestWikipediaTokenizerFactory.java | 173 +-
.../analysis/wikipedia/WikipediaTokenizerTest.java | 521 ++-
.../analysis/standard/GenerateJflexTLDMacros.java | 177 +-
.../apache/lucene/analysis/LowerCaseFilter.java | 4 +-
.../tokenattributes/PackedTokenAttributeImpl.java | 4 +-
.../org/apache/lucene/search/PhrasePositions.java | 4 +-
.../org/apache/lucene/util/AttributeSource.java | 4 +-
.../java/org/apache/lucene/util/UnicodeUtil.java | 8 +-
.../apache/lucene/util/automaton/StatePair.java | 4 +-
.../apache/lucene/queries/CommonTermsQuery.java | 249 +-
.../queries/function/FunctionMatchQuery.java | 51 +-
.../lucene/queries/function/FunctionQuery.java | 42 +-
.../queries/function/FunctionRangeQuery.java | 79 +-
.../queries/function/FunctionScoreQuery.java | 140 +-
.../lucene/queries/function/FunctionValues.java | 154 +-
.../queries/function/IndexReaderFunctions.java | 27 +-
.../lucene/queries/function/ValueSource.java | 88 +-
.../lucene/queries/function/ValueSourceScorer.java | 55 +-
.../queries/function/docvalues/BoolDocValues.java | 11 +-
.../function/docvalues/DocTermsIndexDocValues.java | 37 +-
.../function/docvalues/DoubleDocValues.java | 40 +-
.../queries/function/docvalues/FloatDocValues.java | 11 +-
.../queries/function/docvalues/IntDocValues.java | 27 +-
.../queries/function/docvalues/LongDocValues.java | 29 +-
.../queries/function/docvalues/StrDocValues.java | 1 -
.../queries/function/docvalues/package-info.java | 4 +-
.../lucene/queries/function/package-info.java | 4 +-
.../queries/function/valuesource/BoolFunction.java | 4 +-
.../function/valuesource/BytesRefFieldSource.java | 16 +-
.../valuesource/ComparisonBoolFunction.java | 22 +-
.../function/valuesource/ConstNumberSource.java | 13 +-
.../function/valuesource/ConstValueSource.java | 30 +-
.../queries/function/valuesource/DefFunction.java | 23 +-
.../function/valuesource/DivFloatFunction.java | 12 +-
.../function/valuesource/DocFreqValueSource.java | 38 +-
.../valuesource/DoubleConstValueSource.java | 20 +-
.../function/valuesource/DoubleFieldSource.java | 20 +-
.../function/valuesource/DualFloatFunction.java | 44 +-
.../function/valuesource/EnumFieldSource.java | 54 +-
.../function/valuesource/FieldCacheSource.java | 11 +-
.../function/valuesource/FloatFieldSource.java | 29 +-
.../function/valuesource/IDFValueSource.java | 35 +-
.../queries/function/valuesource/IfFunction.java | 37 +-
.../function/valuesource/IntFieldSource.java | 24 +-
.../valuesource/JoinDocFreqValueSource.java | 24 +-
.../function/valuesource/LinearFloatFunction.java | 27 +-
.../function/valuesource/LiteralValueSource.java | 18 +-
.../function/valuesource/LongFieldSource.java | 25 +-
.../function/valuesource/MaxDocValueSource.java | 17 +-
.../function/valuesource/MaxFloatFunction.java | 15 +-
.../function/valuesource/MinFloatFunction.java | 18 +-
.../function/valuesource/MultiBoolFunction.java | 16 +-
.../function/valuesource/MultiFloatFunction.java | 45 +-
.../function/valuesource/MultiFunction.java | 76 +-
.../function/valuesource/MultiValueSource.java | 8 +-
.../valuesource/MultiValuedDoubleFieldSource.java | 24 +-
.../valuesource/MultiValuedFloatFieldSource.java | 24 +-
.../valuesource/MultiValuedIntFieldSource.java | 24 +-
.../valuesource/MultiValuedLongFieldSource.java | 24 +-
.../function/valuesource/NormValueSource.java | 47 +-
.../function/valuesource/NumDocsValueSource.java | 18 +-
.../function/valuesource/PowFloatFunction.java | 16 +-
.../function/valuesource/ProductFloatFunction.java | 5 +-
.../function/valuesource/QueryValueSource.java | 52 +-
.../valuesource/RangeMapFloatFunction.java | 75 +-
.../valuesource/ReciprocalFloatFunction.java | 61 +-
.../function/valuesource/ScaleFloatFunction.java | 75 +-
.../function/valuesource/SimpleBoolFunction.java | 21 +-
.../function/valuesource/SimpleFloatFunction.java | 16 +-
.../function/valuesource/SingleFunction.java | 17 +-
.../function/valuesource/SortedSetFieldSource.java | 22 +-
.../function/valuesource/SumFloatFunction.java | 7 +-
.../valuesource/SumTotalTermFreqValueSource.java | 31 +-
.../function/valuesource/TFValueSource.java | 131 +-
.../function/valuesource/TermFreqValueSource.java | 119 +-
.../valuesource/TotalTermFreqValueSource.java | 39 +-
.../function/valuesource/VectorValueSource.java | 35 +-
.../queries/function/valuesource/package-info.java | 4 +-
.../queries/intervals/BlockIntervalsSource.java | 14 +-
.../queries/intervals/CachingMatchesIterator.java | 17 +-
.../lucene/queries/intervals/ConjunctionDISI.java | 16 +-
.../intervals/ConjunctionIntervalIterator.java | 2 -
.../intervals/ConjunctionIntervalsSource.java | 21 +-
.../intervals/ContainedByIntervalsSource.java | 6 +-
.../intervals/ContainingIntervalsSource.java | 6 +-
.../intervals/DifferenceIntervalsSource.java | 16 +-
.../queries/intervals/DisiPriorityQueue.java | 15 +-
.../lucene/queries/intervals/DisiWrapper.java | 1 -
.../intervals/DisjunctionDISIApproximation.java | 8 +-
.../intervals/DisjunctionIntervalsSource.java | 195 +-
.../lucene/queries/intervals/Disjunctions.java | 15 +-
.../intervals/ExtendedIntervalIterator.java | 15 +-
.../queries/intervals/ExtendedIntervalsSource.java | 15 +-
.../queries/intervals/FilteredIntervalsSource.java | 38 +-
.../intervals/FixedFieldIntervalsSource.java | 11 +-
.../lucene/queries/intervals/IntervalFilter.java | 15 +-
.../lucene/queries/intervals/IntervalIterator.java | 54 +-
.../lucene/queries/intervals/IntervalMatches.java | 11 +-
.../queries/intervals/IntervalMatchesIterator.java | 10 +-
.../lucene/queries/intervals/IntervalQuery.java | 102 +-
.../queries/intervals/IntervalScoreFunction.java | 27 +-
.../lucene/queries/intervals/IntervalScorer.java | 12 +-
.../apache/lucene/queries/intervals/Intervals.java | 222 +-
.../lucene/queries/intervals/IntervalsSource.java | 43 +-
.../MinimizingConjunctionMatchesIterator.java | 6 +-
.../MinimumShouldMatchIntervalsSource.java | 54 +-
.../intervals/MultiTermIntervalsSource.java | 28 +-
.../intervals/NonOverlappingIntervalsSource.java | 6 +-
.../intervals/NotContainedByIntervalsSource.java | 12 +-
.../intervals/NotContainingIntervalsSource.java | 10 +-
.../queries/intervals/OffsetIntervalsSource.java | 19 +-
.../queries/intervals/OrderedIntervalsSource.java | 32 +-
.../intervals/OverlappingIntervalsSource.java | 10 +-
.../PayloadFilteredTermIntervalsSource.java | 43 +-
.../lucene/queries/intervals/RelativeIterator.java | 3 +-
.../intervals/RepeatingIntervalsSource.java | 24 +-
.../queries/intervals/TermIntervalsSource.java | 64 +-
.../intervals/UnorderedIntervalsSource.java | 30 +-
.../lucene/queries/intervals/package-info.java | 65 +-
.../apache/lucene/queries/mlt/MoreLikeThis.java | 366 +--
.../lucene/queries/mlt/MoreLikeThisQuery.java | 42 +-
.../apache/lucene/queries/mlt/package-info.java | 4 +-
.../org/apache/lucene/queries/package-info.java | 4 +-
.../queries/payloads/AveragePayloadFunction.java | 25 +-
.../queries/payloads/MaxPayloadFunction.java | 25 +-
.../queries/payloads/MinPayloadFunction.java | 26 +-
.../lucene/queries/payloads/PayloadDecoder.java | 13 +-
.../lucene/queries/payloads/PayloadFunction.java | 37 +-
.../lucene/queries/payloads/PayloadScoreQuery.java | 86 +-
.../queries/payloads/SpanPayloadCheckQuery.java | 67 +-
.../queries/payloads/SumPayloadFunction.java | 23 +-
.../lucene/queries/payloads/package-info.java | 16 +-
.../lucene/queries/TestCommonTermsQuery.java | 308 +-
.../lucene/queries/function/FunctionTestSetup.java | 101 +-
.../function/TestDocValuesFieldSources.java | 18 +-
.../queries/function/TestFieldScoreQuery.java | 72 +-
.../queries/function/TestFunctionMatchQuery.java | 12 +-
.../function/TestFunctionQueryExplanations.java | 16 +-
.../queries/function/TestFunctionQuerySort.java | 18 +-
.../queries/function/TestFunctionRangeQuery.java | 44 +-
.../function/TestFunctionScoreExplanations.java | 11 +-
.../queries/function/TestFunctionScoreQuery.java | 170 +-
.../queries/function/TestIndexReaderFunctions.java | 80 +-
.../queries/function/TestLongNormValueSource.java | 6 +-
.../queries/function/TestSortedSetFieldSource.java | 14 +-
.../lucene/queries/function/TestValueSources.java | 562 ++--
.../docvalues/TestBoolValOfNumericDVs.java | 72 +-
.../queries/intervals/OneTimeIntervalSource.java | 14 +-
.../queries/intervals/TestDisjunctionRewrites.java | 213 +-
.../queries/intervals/TestIntervalQuery.java | 361 +-
.../lucene/queries/intervals/TestIntervals.java | 869 +++--
.../intervals/TestPayloadFilteredInterval.java | 33 +-
.../queries/intervals/TestSimplifications.java | 44 +-
.../lucene/queries/mlt/TestMoreLikeThis.java | 280 +-
.../lucene/queries/payloads/PayloadHelper.java | 54 +-
.../queries/payloads/TestPayloadCheckQuery.java | 104 +-
.../queries/payloads/TestPayloadExplanations.java | 69 +-
.../queries/payloads/TestPayloadScoreQuery.java | 217 +-
.../lucene/queries/payloads/TestPayloadSpans.java | 226 +-
.../queries/payloads/TestPayloadTermQuery.java | 152 +-
.../org/apache/lucene/spatial/SpatialTestCase.java | 4 -
.../solr/handler/FieldAnalysisRequestHandler.java | 2 -
896 files changed, 44593 insertions(+), 39638 deletions(-)
diff --git a/gradle/generation/jflex.gradle b/gradle/generation/jflex.gradle
index f6e8351..52bf08d 100644
--- a/gradle/generation/jflex.gradle
+++ b/gradle/generation/jflex.gradle
@@ -26,14 +26,6 @@ configure(rootProject) {
dependencies {
jflex "de.jflex:jflex:${scriptDepVersions['jflex']}"
}
-
- task jflex() {
- description "Regenerate sources for corresponding jflex grammar files."
- group "generation"
-
- dependsOn ":lucene:core:jflexStandardTokenizerImpl"
- dependsOn ":lucene:analysis:common:jflexUAX29URLEmailTokenizerImpl"
- }
}
// We always regenerate, no need to declare outputs.
@@ -115,6 +107,22 @@ configure(project(":lucene:core")) {
configure(project(":lucene:analysis:common")) {
+ task jflexWikipediaTokenizerImpl(type: JFlexTask) {
+ description "Regenerate WikipediaTokenizerImpl.java"
+ group "generation"
+
+ jflexFile = file('src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex')
+ skeleton = project(":lucene:core").file("src/data/jflex/skeleton.default")
+ }
+
+ task jflexClassicTokenizerImpl(type: JFlexTask) {
+ description "Regenerate ClassicTokenizerImpl.java"
+ group "generation"
+
+ jflexFile = file('src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex')
+ skeleton = project(":lucene:core").file("src/data/jflex/skeleton.default")
+ }
+
task jflexUAX29URLEmailTokenizerImpl(type: JFlexTask) {
description "Regenerate UAX29URLEmailTokenizerImpl.java"
group "generation"
@@ -166,4 +174,26 @@ configure(project(":lucene:analysis:common")) {
)
}
}
+
+ task regenerate() {
+ description "Regenerate any generated sources"
+ group "generation"
+
+ // Run regeneration tasks.
+ dependsOn jflexUAX29URLEmailTokenizerImpl
+ dependsOn jflexHTMLStripCharFilter
+ dependsOn jflexClassicTokenizerImpl
+ dependsOn jflexWikipediaTokenizerImpl
+
+ // Clean up and reformat the generated sources after generation.
+ dependsOn "tidy"
+ }
+
+ // Make sure tidy runs after generation, if they're defined.
+ tasks.matching { it.name == "tidy" }.configureEach {
+ mustRunAfter jflexUAX29URLEmailTokenizerImpl,
+ jflexHTMLStripCharFilter,
+ jflexClassicTokenizerImpl,
+ jflexWikipediaTokenizerImpl
+ }
}
diff --git a/gradle/validation/spotless.gradle b/gradle/validation/spotless.gradle
index 7276c9d..492f306 100644
--- a/gradle/validation/spotless.gradle
+++ b/gradle/validation/spotless.gradle
@@ -29,42 +29,42 @@ allprojects { prj ->
java {
// TODO: work out how to have multiple different header files (we have
// classes in the codebase that have original headers).
-
// licenseHeaderFile file("${resources}/asl-header.txt"), '^(\\s*package)'
+
lineEndings 'UNIX'
endWithNewline()
googleJavaFormat('1.9')
switch (project.path) {
// These modules are complete - all sources scanned.
+ case ":lucene:core":
+ target "src/java/**/*.java",
+ "src/test/**/*.java"
+ targetExclude "**/resources/**", "**/StandardTokenizerImpl.java"
+ break
+
case ":lucene:highlighter":
- target "src/java/**", "src/test/**"
- targetExclude "**/overview.html", "**/CambridgeMA.utf8"
+ target "src/java/**/*.java",
+ "src/test/**/*.java"
+ targetExclude "**/resources/**"
break
- // Partially complete.
- case ":lucene:core":
- target "src/java/**", "src/test/**"
- targetExclude "**/overview.html",
- "**/META-INF/**",
- "**/StandardTokenizerImpl.jflex",
- "**/StandardTokenizerImpl.java",
- "**/createLevAutomata.py",
- "**/UTF32ToUTF8.py",
- "**/gen_BulkOperation.py",
- "**/gen_Packed64SingleBlock.py",
- "**/makeEuroparlLineFile.py",
- "**/wordliststopwords.txt",
- "**/wordliststopwords_nocomment.txt",
- "**/gen_ForUtil.py"
+ case ":lucene:queries":
+ target "src/java/**/*.java",
+ "src/test/**/*.java"
+ targetExclude "**/resources/**"
break
case ":lucene:analysis:common":
- target "src/**/org/apache/lucene/analysis/ar/**",
- "src/**/org/apache/lucene/collation/**"
- targetExclude "**/resources/**"
+ target "src/**/*.java"
+ targetExclude "**/resources/**",
+ "**/HTMLStripCharFilter.java",
+ "**/UAX29URLEmailTokenizerImpl.java",
+ "**/tartarus/**"
break
+ // Partially complete.
+
// All others - disable reformatting/ checks for now.
case ":lucene:analysis:icu":
case ":lucene:analysis:kuromoji":
@@ -87,7 +87,6 @@ allprojects { prj ->
case ":lucene:memory":
case ":lucene:misc":
case ":lucene:monitor":
- case ":lucene:queries":
case ":lucene:queryparser":
case ":lucene:replicator":
case ":lucene:sandbox":
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
index 7077d2b..2e50563 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.bg;
-
import java.io.IOException;
import java.io.Reader;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -32,10 +30,9 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* {@link Analyzer} for Bulgarian.
- * <p>
- * This analyzer implements light-stemming as specified by: <i> Searching
- * Strategies for the Bulgarian Language </i>
- * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ *
+ * <p>This analyzer implements light-stemming as specified by: <i> Searching Strategies for the
+ * Bulgarian Language </i> http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
*
* @since 3.1
*/
@@ -43,32 +40,32 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Bulgarian stopwords.
- *
- * Default stopword list is from
- * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
- * BSD-Licensed.
+ *
+ * <p>Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html The
+ * stopword list is BSD-Licensed.
*/
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
- *
+ *
* @return an unmodifiable instance of the default stop-words set.
*/
public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
/**
- * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
- * class accesses the static final set the first time.;
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+ * static final set the first time.;
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
-
+
static {
try {
- DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
+ DEFAULT_STOP_SET =
+ loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
@@ -76,53 +73,46 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
}
}
}
-
+
private final CharArraySet stemExclusionSet;
-
- /**
- * Builds an analyzer with the default stop words:
- * {@link #DEFAULT_STOPWORD_FILE}.
- */
+
+ /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public BulgarianAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
-
- /**
- * Builds an analyzer with the given stop words.
- */
+
+ /** Builds an analyzer with the given stop words. */
public BulgarianAnalyzer(CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET);
}
-
+
/**
- * Builds an analyzer with the given stop words and a stem exclusion set.
- * If a stem exclusion set is provided this analyzer will add a {@link SetKeywordMarkerFilter}
- * before {@link BulgarianStemFilter}.
+ * Builds an analyzer with the given stop words and a stem exclusion set. If a stem exclusion set
+ * is provided this analyzer will add a {@link SetKeywordMarkerFilter} before {@link
+ * BulgarianStemFilter}.
*/
public BulgarianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
- this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}
/**
- * Creates a
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from an {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}
- * , {@link SetKeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link BulgarianStemFilter}.
+ * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
+ * the text in the provided {@link Reader}.
+ *
+ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an
+ * {@link StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link StopFilter}, {@link
+ * SetKeywordMarkerFilter} if a stem exclusion set is provided and {@link
+ * BulgarianStemFilter}.
*/
@Override
public TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
result = new StopFilter(result, stopwords);
- if(!stemExclusionSet.isEmpty())
+ if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+ }
result = new BulgarianStemFilter(result);
return new TokenStreamComponents(source, result);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
index eec74a6..9618ea6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
@@ -16,37 +16,33 @@
*/
package org.apache.lucene.analysis.bg;
-
import java.io.IOException;
-
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
- * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
- * words.
- * <p>
- * To prevent terms from being stemmed use an instance of
- * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
- * the {@link KeywordAttribute} before this {@link TokenStream}.
- * </p>
+ * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian words.
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
*/
public final class BulgarianStemFilter extends TokenFilter {
private final BulgarianStemmer stemmer = new BulgarianStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
-
+
public BulgarianStemFilter(final TokenStream input) {
super(input);
}
-
+
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- if(!keywordAttr.isKeyword()) {
+ if (!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
index 29fee9f..8aebc1e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.bg;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
-/**
+/**
* Factory for {@link BulgarianStemFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -32,6 +31,7 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* <filter class="solr.BulgarianStemFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 3.1.0
* @lucene.spi {@value #NAME}
*/
@@ -41,13 +41,13 @@ public class BulgarianStemFilterFactory extends TokenFilterFactory {
public static final String NAME = "bulgarianStem";
/** Creates a new BulgarianStemFilterFactory */
- public BulgarianStemFilterFactory(Map<String,String> args) {
+ public BulgarianStemFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public BulgarianStemFilterFactory() {
throw defaultCtorException();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
index 0f5ebb2..5f727f8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
@@ -16,46 +16,43 @@
*/
package org.apache.lucene.analysis.bg;
-
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Light Stemmer for Bulgarian.
- * <p>
- * Implements the algorithm described in:
- * <i>
- * Searching Strategies for the Bulgarian Language
- * </i>
- * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ *
+ * <p>Implements the algorithm described in: <i> Searching Strategies for the Bulgarian Language
+ * </i> http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
*/
public class BulgarianStemmer {
-
+
/**
* Stem an input buffer of Bulgarian text.
- *
+ *
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int stem(final char s[], int len) {
if (len < 4) // do not stem
- return len;
-
- if (len > 5 && endsWith(s, len, "ища"))
+ return len;
+
+ if (len > 5 && endsWith(s, len, "ища")) {
return len - 3;
-
+ }
+
len = removeArticle(s, len);
len = removePlural(s, len);
-
+
if (len > 3) {
- if (endsWith(s, len, "я"))
+ if (endsWith(s, len, "я")) {
len--;
- if (endsWith(s, len, "а") ||
- endsWith(s, len, "о") ||
- endsWith(s, len, "е"))
+ }
+ if (endsWith(s, len, "а") || endsWith(s, len, "о") || endsWith(s, len, "е")) {
len--;
+ }
}
-
+
// the rule to rewrite ен -> н is duplicated in the paper.
// in the perl implementation referenced by the paper, this is fixed.
// (it is fixed here as well)
@@ -63,7 +60,7 @@ public class BulgarianStemmer {
s[len - 2] = 'н'; // replace with н
len--;
}
-
+
if (len > 5 && s[len - 2] == 'ъ') {
s[len - 2] = s[len - 1]; // replace ъN with N
len--;
@@ -71,49 +68,47 @@ public class BulgarianStemmer {
return len;
}
-
+
/**
* Mainly remove the definite article
+ *
* @param s input buffer
* @param len length of input buffer
* @return new stemmed length
*/
private int removeArticle(final char s[], final int len) {
- if (len > 6 && endsWith(s, len, "ият"))
- return len - 3;
-
+ if (len > 6 && endsWith(s, len, "ият")) return len - 3;
+
if (len > 5) {
- if (endsWith(s, len, "ът") ||
- endsWith(s, len, "то") ||
- endsWith(s, len, "те") ||
- endsWith(s, len, "та") ||
- endsWith(s, len, "ия"))
+ if (endsWith(s, len, "ът")
+ || endsWith(s, len, "то")
+ || endsWith(s, len, "те")
+ || endsWith(s, len, "та")
+ || endsWith(s, len, "ия")) {
return len - 2;
+ }
}
-
- if (len > 4 && endsWith(s, len, "ят"))
+
+ if (len > 4 && endsWith(s, len, "ят")) {
return len - 2;
+ }
return len;
}
-
+
private int removePlural(final char s[], final int len) {
if (len > 6) {
- if (endsWith(s, len, "овци"))
- return len - 3; // replace with о
- if (endsWith(s, len, "ове"))
- return len - 3;
+ if (endsWith(s, len, "овци")) return len - 3; // replace with о
+ if (endsWith(s, len, "ове")) return len - 3;
if (endsWith(s, len, "еве")) {
s[len - 3] = 'й'; // replace with й
return len - 2;
}
}
-
+
if (len > 5) {
- if (endsWith(s, len, "ища"))
- return len - 3;
- if (endsWith(s, len, "та"))
- return len - 2;
+ if (endsWith(s, len, "ища")) return len - 3;
+ if (endsWith(s, len, "та")) return len - 2;
if (endsWith(s, len, "ци")) {
s[len - 2] = 'к'; // replace with к
return len - 1;
@@ -122,22 +117,21 @@ public class BulgarianStemmer {
s[len - 2] = 'г'; // replace with г
return len - 1;
}
-
+
if (s[len - 3] == 'е' && s[len - 1] == 'и') {
s[len - 3] = 'я'; // replace е with я, remove и
return len - 1;
}
}
-
+
if (len > 4) {
if (endsWith(s, len, "си")) {
s[len - 2] = 'х'; // replace with х
return len - 1;
}
- if (endsWith(s, len, "и"))
- return len - 1;
+ if (endsWith(s, len, "и")) return len - 1;
}
-
+
return len;
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/package-info.java
index c60a53b..d4aaeaf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Analyzer for Bulgarian.
- */
+/** Analyzer for Bulgarian. */
package org.apache.lucene.analysis.bg;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
index fe4d08a..dc21f14 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.bn;
-
import java.io.IOException;
import java.io.Reader;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
@@ -38,43 +36,46 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
*/
public final class BengaliAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
-
+
/**
* File containing default Bengali stopwords.
- *
- * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt
- * The stopword list is BSD-Licensed.
+ *
+ * <p>Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt The
+ * stopword list is BSD-Licensed.
*/
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
private static final String STOPWORDS_COMMENT = "#";
-
+
/**
* Returns an unmodifiable instance of the default stop-words set.
+ *
* @return an unmodifiable instance of the default stop-words set.
*/
- public static CharArraySet getDefaultStopSet(){
+ public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
/**
- * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
- * accesses the static final set the first time.;
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+ * static final set the first time.;
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ DEFAULT_STOP_SET =
+ loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
throw new RuntimeException("Unable to load default stopword set");
}
}
}
-
+
/**
* Builds an analyzer with the given stop words
- *
+ *
* @param stopwords a stopword set
* @param stemExclusionSet a stemming exclusion set
*/
@@ -82,43 +83,39 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase {
super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}
-
+
/**
- * Builds an analyzer with the given stop words
- *
+ * Builds an analyzer with the given stop words
+ *
* @param stopwords a stopword set
*/
public BengaliAnalyzer(CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET);
}
-
- /**
- * Builds an analyzer with the default stop words:
- * {@link #DEFAULT_STOPWORD_FILE}.
- */
+
+ /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public BengaliAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
- * Creates
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * used to tokenize all the text in the provided {@link Reader}.
- *
- * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
- * {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
- * if a stem exclusion set is provided, {@link BengaliStemFilter}, and
- * Bengali Stop words
+ * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
+ * the text in the provided {@link Reader}.
+ *
+ * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
+ * StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
+ * {@link IndicNormalizationFilter}, {@link BengaliNormalizationFilter}, {@link
+ * SetKeywordMarkerFilter} if a stem exclusion set is provided, {@link BengaliStemFilter}, and
+ * Bengali Stop words
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
result = new DecimalDigitFilter(result);
- if (!stemExclusionSet.isEmpty())
+ if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+ }
result = new IndicNormalizationFilter(result);
result = new BengaliNormalizationFilter(result);
result = new StopFilter(result, stopwords);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
index 46874b5..effa3c7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
@@ -16,24 +16,20 @@
*/
package org.apache.lucene.analysis.bn;
-
+import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import java.io.IOException;
-
/**
- * A {@link TokenFilter} that applies {@link BengaliNormalizer} to normalize the
- * orthography.
- * <p>
- * In some cases the normalization may cause unrelated terms to conflate, so
- * to prevent terms from being normalized use an instance of
- * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
- * the {@link KeywordAttribute} before this {@link TokenStream}.
- * </p>
+ * A {@link TokenFilter} that applies {@link BengaliNormalizer} to normalize the orthography.
+ *
+ * <p>In some cases the normalization may cause unrelated terms to conflate, so to prevent terms
+ * from being normalized use an instance of {@link SetKeywordMarkerFilter} or a custom {@link
+ * TokenFilter} that sets the {@link KeywordAttribute} before this {@link TokenStream}.
+ *
* @see BengaliNormalizer
*/
public final class BengaliNormalizationFilter extends TokenFilter {
@@ -41,7 +37,7 @@ public final class BengaliNormalizationFilter extends TokenFilter {
private final BengaliNormalizer normalizer = new BengaliNormalizer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
-
+
public BengaliNormalizationFilter(TokenStream input) {
super(input);
}
@@ -50,10 +46,9 @@ public final class BengaliNormalizationFilter extends TokenFilter {
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAtt.isKeyword())
- termAtt.setLength(normalizer.normalize(termAtt.buffer(),
- termAtt.length()));
+ termAtt.setLength(normalizer.normalize(termAtt.buffer(), termAtt.length()));
return true;
- }
+ }
return false;
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
index 8956e6d..b95cbfb 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.bn;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
-/**
- * Factory for {@link BengaliNormalizationFilter}.
+/**
+ * Factory for {@link BengaliNormalizationFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -31,6 +30,7 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* <filter class="solr.BengaliNormalizationFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 7.1.0
* @lucene.spi {@value #NAME}
*/
@@ -39,13 +39,13 @@ public class BengaliNormalizationFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "bengaliNormalization";
- public BengaliNormalizationFilterFactory(Map<String,String> args) {
+ public BengaliNormalizationFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public BengaliNormalizationFilterFactory() {
throw defaultCtorException();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
index 1718cbc..c047fbd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
@@ -16,23 +16,20 @@
*/
package org.apache.lucene.analysis.bn;
-
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
/**
* Normalizer for Bengali.
- * <p>
- * Implements the Bengali-language specific algorithm specified in:
- * <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
- * Naushad UzZaman and Mumit Khan.
+ *
+ * <p>Implements the Bengali-language specific algorithm specified in: <i>A Double Metaphone
+ * encoding for Bangla and its application in spelling checker</i> Naushad UzZaman and Mumit Khan.
* http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
- * </p>
*/
public class BengaliNormalizer {
/**
* Normalize an input buffer of Bengali text
*
- * @param s input buffer
+ * @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
@@ -40,102 +37,103 @@ public class BengaliNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
- // delete Chandrabindu
+ // delete Chandrabindu
case '\u0981':
len = delete(s, i, len);
i--;
break;
- // DirghoI kar -> RosshoI kar
+ // DirghoI kar -> RosshoI kar
case '\u09C0':
s[i] = '\u09BF';
break;
- // DirghoU kar -> RosshoU kar
+ // DirghoU kar -> RosshoU kar
case '\u09C2':
s[i] = '\u09C1';
break;
- // Khio (Ka + Hoshonto + Murdorno Sh)
+ // Khio (Ka + Hoshonto + Murdorno Sh)
case '\u0995':
- if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09BF') {
+ if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
if (i == 0) {
s[i] = '\u0996';
len = delete(s, i + 2, len);
len = delete(s, i + 1, len);
} else {
- s[i+1] = '\u0996';
+ s[i + 1] = '\u0996';
len = delete(s, i + 2, len);
}
}
break;
- // Nga to Anusvara
+ // Nga to Anusvara
case '\u0999':
s[i] = '\u0982';
break;
- // Ja Phala
+ // Ja Phala
case '\u09AF':
- if(i - 2 == 0 && s[i-1] == '\u09CD') {
+ if (i - 2 == 0 && s[i - 1] == '\u09CD') {
s[i - 1] = '\u09C7';
- if(i + 1 < len && s[i+1] == '\u09BE') {
- len = delete(s, i+1, len);
+ if (i + 1 < len && s[i + 1] == '\u09BE') {
+ len = delete(s, i + 1, len);
}
len = delete(s, i, len);
- i --;
- } else if(i - 1 >= 0 && s[i-1] == '\u09CD' ){
+ i--;
+ } else if (i - 1 >= 0 && s[i - 1] == '\u09CD') {
len = delete(s, i, len);
- len = delete(s, i-1, len);
- i -=2;
+ len = delete(s, i - 1, len);
+ i -= 2;
}
break;
- // Ba Phalaa
+ // Ba Phalaa
case '\u09AC':
- if((i >= 1 && s[i-1] != '\u09CD') || i == 0)
+ if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
break;
- if(i - 2 == 0) {
+ }
+ if (i - 2 == 0) {
len = delete(s, i, len);
len = delete(s, i - 1, len);
i -= 2;
- } else if(i - 5 >= 0 && s[i - 3] == '\u09CD') {
+ } else if (i - 5 >= 0 && s[i - 3] == '\u09CD') {
len = delete(s, i, len);
- len = delete(s, i-1, len);
- i -=2;
- } else if(i - 2 >= 0){
+ len = delete(s, i - 1, len);
+ i -= 2;
+ } else if (i - 2 >= 0) {
s[i - 1] = s[i - 2];
len = delete(s, i, len);
- i --;
+ i--;
}
break;
- // Visarga
+ // Visarga
case '\u0983':
- if(i == len -1) {
- if(len <= 3) {
+ if (i == len - 1) {
+ if (len <= 3) {
s[i] = '\u09B9';
} else {
len = delete(s, i, len);
}
} else {
- s[i] = s[i+1];
+ s[i] = s[i + 1];
}
break;
- //All sh
+ // All sh
case '\u09B6':
case '\u09B7':
s[i] = '\u09B8';
break;
- //check na
+ // check na
case '\u09A3':
s[i] = '\u09A8';
break;
- //check ra
+ // check ra
case '\u09DC':
case '\u09DD':
s[i] = '\u09B0';
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
index 9787027..49cfd38 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
@@ -16,31 +16,28 @@
*/
package org.apache.lucene.analysis.bn;
-
+import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import java.io.IOException;
-
-/**
- * A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words.
- */
+/** A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words. */
public final class BengaliStemFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final BengaliStemmer bengaliStemmer = new BengaliStemmer();
-
+
public BengaliStemFilter(TokenStream input) {
super(input);
}
-
+
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttribute.isKeyword())
- termAttribute.setLength(bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length()));
+ termAttribute.setLength(
+ bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length()));
return true;
} else {
return false;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
index ca84aca..a8711b4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.bn;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilterFactory;
-
import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
-/**
- * Factory for {@link BengaliStemFilter}.
+/**
+ * Factory for {@link BengaliStemFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_histem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -31,6 +30,7 @@ import java.util.Map;
* <filter class="solr.BengaliStemFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 7.1.0
* @lucene.spi {@value #NAME}
*/
@@ -39,13 +39,13 @@ public class BengaliStemFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "bengaliStem";
- public BengaliStemFilterFactory(Map<String,String> args) {
+ public BengaliStemFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public BengaliStemFilterFactory() {
throw defaultCtorException();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
index 8bc555a..e07521c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
@@ -16,167 +16,148 @@
*/
package org.apache.lucene.analysis.bn;
-
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
/**
* Stemmer for Bengali.
- * <p>
- * The algorithm is based on the report in:
- * <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
- * P Sengupta and B B Chaudhuri
- * </p>
*
- * <p>
- * Few Stemmer criteria are taken from:
- * <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
- * </p>
+ * <p>The algorithm is based on the report in: <i>Natural Language Processing in an Indian Language
+ * (Bengali)-I: Verb Phrase Analysis</i> P Sengupta and B B Chaudhuri
+ *
+ * <p>Few Stemmer criteria are taken from:
+ * <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
*/
public class BengaliStemmer {
public int stem(char buffer[], int len) {
// 8
- if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
- || endsWith(buffer, len, "িতেছিলাম")
- || endsWith(buffer, len, "িতেছিলেন")
- || endsWith(buffer, len, "ইতেছিলেন")
- || endsWith(buffer, len, "িয়াছিলেন")
- || endsWith(buffer, len, "ইয়াছিলেন")
- ))
- return len - 8;
+ if (len > 9
+ && (endsWith(buffer, len, "িয়াছিলাম")
+ || endsWith(buffer, len, "িতেছিলাম")
+ || endsWith(buffer, len, "িতেছিলেন")
+ || endsWith(buffer, len, "ইতেছিলেন")
+ || endsWith(buffer, len, "িয়াছিলেন")
+ || endsWith(buffer, len, "ইয়াছিলেন"))) return len - 8;
// 7
- if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
- || endsWith(buffer, len, "িতেছিলে")
- || endsWith(buffer, len, "িয়াছিলা")
- || endsWith(buffer, len, "িয়াছিলে")
- || endsWith(buffer, len, "িতেছিলা")
- || endsWith(buffer, len, "িয়াছিলি")
-
- || endsWith(buffer, len, "য়েদেরকে")
- ))
- return len - 7;
+ if ((len > 8)
+ && (endsWith(buffer, len, "িতেছিলি")
+ || endsWith(buffer, len, "িতেছিলে")
+ || endsWith(buffer, len, "িয়াছিলা")
+ || endsWith(buffer, len, "িয়াছিলে")
+ || endsWith(buffer, len, "িতেছিলা")
+ || endsWith(buffer, len, "িয়াছিলি")
+ || endsWith(buffer, len, "য়েদেরকে"))) return len - 7;
// 6
- if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
- || endsWith(buffer, len, "িতেছেন")
- || endsWith(buffer, len, "িয়াছিস")
- || endsWith(buffer, len, "িয়াছেন")
- || endsWith(buffer, len, "েছিলাম")
- || endsWith(buffer, len, "েছিলেন")
-
- || endsWith(buffer, len, "েদেরকে")
- ))
- return len - 6;
+ if ((len > 7)
+ && (endsWith(buffer, len, "িতেছিস")
+ || endsWith(buffer, len, "িতেছেন")
+ || endsWith(buffer, len, "িয়াছিস")
+ || endsWith(buffer, len, "িয়াছেন")
+ || endsWith(buffer, len, "েছিলাম")
+ || endsWith(buffer, len, "েছিলেন")
+ || endsWith(buffer, len, "েদেরকে"))) return len - 6;
// 5
- if ((len > 6) && (endsWith(buffer, len, "িতেছি")
- || endsWith(buffer, len, "িতেছা")
- || endsWith(buffer, len, "িতেছে")
- || endsWith(buffer, len, "ছিলাম")
- || endsWith(buffer, len, "ছিলেন")
- || endsWith(buffer, len, "িয়াছি")
- || endsWith(buffer, len, "িয়াছা")
- || endsWith(buffer, len, "িয়াছে")
- || endsWith(buffer, len, "েছিলে")
- || endsWith(buffer, len, "েছিলা")
-
- || endsWith(buffer, len, "য়েদের")
- || endsWith(buffer, len, "দেরকে")
- ))
- return len - 5;
+ if ((len > 6)
+ && (endsWith(buffer, len, "িতেছি")
+ || endsWith(buffer, len, "িতেছা")
+ || endsWith(buffer, len, "িতেছে")
+ || endsWith(buffer, len, "ছিলাম")
+ || endsWith(buffer, len, "ছিলেন")
+ || endsWith(buffer, len, "িয়াছি")
+ || endsWith(buffer, len, "িয়াছা")
+ || endsWith(buffer, len, "িয়াছে")
+ || endsWith(buffer, len, "েছিলে")
+ || endsWith(buffer, len, "েছিলা")
+ || endsWith(buffer, len, "য়েদের")
+ || endsWith(buffer, len, "দেরকে"))) return len - 5;
// 4
- if ((len > 5) && (endsWith(buffer, len, "িলাম")
- || endsWith(buffer, len, "িলেন")
- || endsWith(buffer, len, "িতাম")
- || endsWith(buffer, len, "িতেন")
- || endsWith(buffer, len, "িবেন")
- || endsWith(buffer, len, "ছিলি")
- || endsWith(buffer, len, "ছিলে")
- || endsWith(buffer, len, "ছিলা")
- || endsWith(buffer, len, "তেছে")
- || endsWith(buffer, len, "িতেছ")
-
- || endsWith(buffer, len, "খানা")
- || endsWith(buffer, len, "খানি")
- || endsWith(buffer, len, "গুলো")
- || endsWith(buffer, len, "গুলি")
- || endsWith(buffer, len, "য়েরা")
- || endsWith(buffer, len, "েদের")
- ))
- return len - 4;
+ if ((len > 5)
+ && (endsWith(buffer, len, "িলাম")
+ || endsWith(buffer, len, "িলেন")
+ || endsWith(buffer, len, "িতাম")
+ || endsWith(buffer, len, "িতেন")
+ || endsWith(buffer, len, "িবেন")
+ || endsWith(buffer, len, "ছিলি")
+ || endsWith(buffer, len, "ছিলে")
+ || endsWith(buffer, len, "ছিলা")
+ || endsWith(buffer, len, "তেছে")
+ || endsWith(buffer, len, "িতেছ")
+ || endsWith(buffer, len, "খানা")
+ || endsWith(buffer, len, "খানি")
+ || endsWith(buffer, len, "গুলো")
+ || endsWith(buffer, len, "গুলি")
+ || endsWith(buffer, len, "য়েরা")
+ || endsWith(buffer, len, "েদের"))) return len - 4;
// 3
- if ((len > 4) && (endsWith(buffer, len, "লাম")
- || endsWith(buffer, len, "িলি")
- || endsWith(buffer, len, "ইলি")
- || endsWith(buffer, len, "িলে")
- || endsWith(buffer, len, "ইলে")
- || endsWith(buffer, len, "লেন")
- || endsWith(buffer, len, "িলা")
- || endsWith(buffer, len, "ইলা")
- || endsWith(buffer, len, "তাম")
- || endsWith(buffer, len, "িতি")
- || endsWith(buffer, len, "ইতি")
- || endsWith(buffer, len, "িতে")
- || endsWith(buffer, len, "ইতে")
- || endsWith(buffer, len, "তেন")
- || endsWith(buffer, len, "িতা")
- || endsWith(buffer, len, "িবা")
- || endsWith(buffer, len, "ইবা")
- || endsWith(buffer, len, "িবি")
- || endsWith(buffer, len, "ইবি")
- || endsWith(buffer, len, "বেন")
- || endsWith(buffer, len, "িবে")
- || endsWith(buffer, len, "ইবে")
- || endsWith(buffer, len, "ছেন")
-
- || endsWith(buffer, len, "য়োন")
- || endsWith(buffer, len, "য়ের")
- || endsWith(buffer, len, "েরা")
- || endsWith(buffer, len, "দের")
- ))
- return len - 3;
+ if ((len > 4)
+ && (endsWith(buffer, len, "লাম")
+ || endsWith(buffer, len, "িলি")
+ || endsWith(buffer, len, "ইলি")
+ || endsWith(buffer, len, "িলে")
+ || endsWith(buffer, len, "ইলে")
+ || endsWith(buffer, len, "লেন")
+ || endsWith(buffer, len, "িলা")
+ || endsWith(buffer, len, "ইলা")
+ || endsWith(buffer, len, "তাম")
+ || endsWith(buffer, len, "িতি")
+ || endsWith(buffer, len, "ইতি")
+ || endsWith(buffer, len, "িতে")
+ || endsWith(buffer, len, "ইতে")
+ || endsWith(buffer, len, "তেন")
+ || endsWith(buffer, len, "িতা")
+ || endsWith(buffer, len, "িবা")
+ || endsWith(buffer, len, "ইবা")
+ || endsWith(buffer, len, "িবি")
+ || endsWith(buffer, len, "ইবি")
+ || endsWith(buffer, len, "বেন")
+ || endsWith(buffer, len, "িবে")
+ || endsWith(buffer, len, "ইবে")
+ || endsWith(buffer, len, "ছেন")
+ || endsWith(buffer, len, "য়োন")
+ || endsWith(buffer, len, "য়ের")
+ || endsWith(buffer, len, "েরা")
+ || endsWith(buffer, len, "দের"))) return len - 3;
// 2
- if ((len > 3) && (endsWith(buffer, len, "িস")
- || endsWith(buffer, len, "েন")
- || endsWith(buffer, len, "লি")
- || endsWith(buffer, len, "লে")
- || endsWith(buffer, len, "লা")
- || endsWith(buffer, len, "তি")
- || endsWith(buffer, len, "তে")
- || endsWith(buffer, len, "তা")
- || endsWith(buffer, len, "বি")
- || endsWith(buffer, len, "বে")
- || endsWith(buffer, len, "বা")
- || endsWith(buffer, len, "ছি")
- || endsWith(buffer, len, "ছা")
- || endsWith(buffer, len, "ছে")
- || endsWith(buffer, len, "ুন")
- || endsWith(buffer, len, "ুক")
-
- || endsWith(buffer, len, "টা")
- || endsWith(buffer, len, "টি")
- || endsWith(buffer, len, "নি")
- || endsWith(buffer, len, "ের")
- || endsWith(buffer, len, "তে")
- || endsWith(buffer, len, "রা")
- || endsWith(buffer, len, "কে")
- ))
- return len - 2;
+ if ((len > 3)
+ && (endsWith(buffer, len, "িস")
+ || endsWith(buffer, len, "েন")
+ || endsWith(buffer, len, "লি")
+ || endsWith(buffer, len, "লে")
+ || endsWith(buffer, len, "লা")
+ || endsWith(buffer, len, "তি")
+ || endsWith(buffer, len, "তে")
+ || endsWith(buffer, len, "তা")
+ || endsWith(buffer, len, "বি")
+ || endsWith(buffer, len, "বে")
+ || endsWith(buffer, len, "বা")
+ || endsWith(buffer, len, "ছি")
+ || endsWith(buffer, len, "ছা")
+ || endsWith(buffer, len, "ছে")
+ || endsWith(buffer, len, "ুন")
+ || endsWith(buffer, len, "ুক")
+ || endsWith(buffer, len, "টা")
+ || endsWith(buffer, len, "টি")
+ || endsWith(buffer, len, "নি")
+ || endsWith(buffer, len, "ের")
+ || endsWith(buffer, len, "তে")
+ || endsWith(buffer, len, "রা")
+ || endsWith(buffer, len, "কে"))) return len - 2;
// 1
- if ((len > 2) && (endsWith(buffer, len, "ি")
- || endsWith(buffer, len, "ী")
- || endsWith(buffer, len, "া")
- || endsWith(buffer, len, "ো")
- || endsWith(buffer, len, "ে")
- || endsWith(buffer, len, "ব")
- || endsWith(buffer, len, "ত")
- ))
- return len - 1;
+ if ((len > 2)
+ && (endsWith(buffer, len, "ি")
+ || endsWith(buffer, len, "ী")
+ || endsWith(buffer, len, "া")
+ || endsWith(buffer, len, "ো")
+ || endsWith(buffer, len, "ে")
+ || endsWith(buffer, len, "ব")
+ || endsWith(buffer, len, "ত"))) return len - 1;
return len;
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
index eea39a9..d272858 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Analyzer for Bengali Language.
- */
+/** Analyzer for Bengali Language. */
package org.apache.lucene.analysis.bn;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
index c37f7d7..b70768e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
@@ -16,21 +16,19 @@
*/
package org.apache.lucene.analysis.boost;
+import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;
-import java.io.IOException;
-
-
/**
* Characters before the delimiter are the "token", those after are the boost.
- * <p>
- * For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token
- * and 0.7 is the boost.
- * <p>
- * Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
+ *
+ * <p>For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token and 0.7
+ * is the boost.
+ *
+ * <p>Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
*/
public final class DelimitedBoostTokenFilter extends TokenFilter {
private final char delimiter;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java
index 7965ece..71f42bd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java
@@ -16,13 +16,13 @@
*/
package org.apache.lucene.analysis.boost;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilterFactory;
-
import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link DelimitedBoostTokenFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -35,18 +35,15 @@ import java.util.Map;
*/
public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {
- /**
- * SPI name
- */
+ /** SPI name */
public static final String NAME = "delimitedBoost";
+
public static final String DELIMITER_ATTR = "delimiter";
public static final char DEFAULT_DELIMITER = '|';
private final char delimiter;
- /**
- * Creates a new DelimitedPayloadTokenFilterFactory
- */
+ /** Creates a new DelimitedPayloadTokenFilterFactory */
public DelimitedBoostTokenFilterFactory(Map<String, String> args) {
super(args);
delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER);
@@ -64,5 +61,4 @@ public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {
public DelimitedBoostTokenFilter create(TokenStream input) {
return new DelimitedBoostTokenFilter(input, delimiter);
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java
index 9bae5dc..074baba 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Provides various convenience classes for creating boosts on Tokens.
- */
+/** Provides various convenience classes for creating boosts on Tokens. */
package org.apache.lucene.analysis.boost;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 93c565f..0520ba7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -16,11 +16,9 @@
*/
package org.apache.lucene.analysis.br;
-
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -35,37 +33,39 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.IOUtils;
/**
- * {@link Analyzer} for Brazilian Portuguese language.
- * <p>
- * Supports an external list of stopwords (words that
- * will not be indexed at all) and an external list of exclusions (words that will
- * not be stemmed, but indexed).
- * </p>
+ * {@link Analyzer} for Brazilian Portuguese language.
+ *
+ * <p>Supports an external list of stopwords (words that will not be indexed at all) and an external
+ * list of exclusions (words that will not be stemmed, but indexed).
*
- * <p><b>NOTE</b>: This class uses the same {@link org.apache.lucene.util.Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
+ * <p><b>NOTE</b>: This class uses the same {@link org.apache.lucene.util.Version} dependent
+ * settings as {@link StandardAnalyzer}.
*
* @since 3.1
*/
public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Brazilian Portuguese stopwords. */
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
-
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
/**
* Returns an unmodifiable instance of the default stop-words set.
+ *
* @return an unmodifiable instance of the default stop-words set.
*/
- public static CharArraySet getDefaultStopSet(){
+ public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
-
+
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
- DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#");
+ DEFAULT_STOP_SET =
+ WordlistLoader.getWordSet(
+ IOUtils.getDecodingReader(
+ BrazilianAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
+ "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
@@ -74,34 +74,27 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
}
}
-
- /**
- * Contains words that should be indexed but not stemmed.
- */
+ /** Contains words that should be indexed but not stemmed. */
private CharArraySet excltable = CharArraySet.EMPTY_SET;
- /**
- * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
- */
+ /** Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). */
public BrazilianAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
- *
- * @param stopwords
- * a stopword set
+ *
+ * @param stopwords a stopword set
*/
public BrazilianAnalyzer(CharArraySet stopwords) {
- super(stopwords);
+ super(stopwords);
}
/**
* Builds an analyzer with the given stop words and stemming exclusion words
- *
- * @param stopwords
- * a stopword set
+ *
+ * @param stopwords a stopword set
*/
public BrazilianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
this(stopwords);
@@ -109,21 +102,19 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Creates
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * used to tokenize all the text in the provided {@link Reader}.
- *
- * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}
- * , and {@link BrazilianStemFilter}.
+ * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
+ * the text in the provided {@link Reader}.
+ *
+ * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
+ * StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link StopFilter}, and {@link
+ * BrazilianStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
result = new StopFilter(result, stopwords);
- if(excltable != null && !excltable.isEmpty())
+ if (excltable != null && !excltable.isEmpty())
result = new SetKeywordMarkerFilter(result, excltable);
return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
@@ -133,4 +124,3 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
return new LowerCaseFilter(in);
}
}
-
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
index e605df5..6e8ff1f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.br;
-
import java.io.IOException;
import java.util.Set;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -28,33 +26,31 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link BrazilianStemmer}.
- * <p>
- * To prevent terms from being stemmed use an instance of
- * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
- * the {@link KeywordAttribute} before this {@link TokenStream}.
- * </p>
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
* @see SetKeywordMarkerFilter
- *
*/
public final class BrazilianStemFilter extends TokenFilter {
- /**
- * {@link BrazilianStemmer} in use by this filter.
- */
+ /** {@link BrazilianStemmer} in use by this filter. */
private BrazilianStemmer stemmer = new BrazilianStemmer();
+
private Set<?> exclusions = null;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
/**
- * Creates a new BrazilianStemFilter
- *
- * @param in the source {@link TokenStream}
+ * Creates a new BrazilianStemFilter
+ *
+ * @param in the source {@link TokenStream}
*/
public BrazilianStemFilter(TokenStream in) {
super(in);
}
-
+
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
@@ -63,8 +59,7 @@ public final class BrazilianStemFilter extends TokenFilter {
if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
final String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
- if ((s != null) && !s.equals(term))
- termAtt.setEmpty().append(s);
+ if ((s != null) && !s.equals(term)) termAtt.setEmpty().append(s);
}
return true;
} else {
@@ -72,5 +67,3 @@ public final class BrazilianStemFilter extends TokenFilter {
}
}
}
-
-
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
index b88446f..634e1cc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.br;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
-/**
+/**
* Factory for {@link BrazilianStemFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -40,15 +39,15 @@ public class BrazilianStemFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "brazilianStem";
-
+
/** Creates a new BrazilianStemFilterFactory */
- public BrazilianStemFilterFactory(Map<String,String> args) {
+ public BrazilianStemFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public BrazilianStemFilterFactory() {
throw defaultCtorException();
@@ -59,4 +58,3 @@ public class BrazilianStemFilterFactory extends TokenFilterFactory {
return new BrazilianStemFilter(in);
}
}
-
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java
index 2737358..6958cbe 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java
@@ -18,53 +18,47 @@ package org.apache.lucene.analysis.br;
import java.util.Locale;
-
-/**
- * A stemmer for Brazilian Portuguese words.
- */
+/** A stemmer for Brazilian Portuguese words. */
public class BrazilianStemmer {
private static final Locale locale = new Locale("pt", "BR");
- /**
- * Changed term
- */
- private String TERM ;
- private String CT ;
- private String R1 ;
- private String R2 ;
- private String RV ;
+ /** Changed term */
+ private String TERM;
+ private String CT;
+ private String R1;
+ private String R2;
+ private String RV;
- public BrazilianStemmer() {
- }
+ public BrazilianStemmer() {}
/**
* Stems the given term to an unique <code>discriminator</code>.
*
- * @param term The term that should be stemmed.
- * @return Discriminator for <code>term</code>
+ * @param term The term that should be stemmed.
+ * @return Discriminator for <code>term</code>
*/
- protected String stem( String term ) {
- boolean altered = false ; // altered the term
+ protected String stem(String term) {
+ boolean altered = false; // altered the term
// creates CT
- createCT(term) ;
+ createCT(term);
- if ( !isIndexable( CT ) ) {
+ if (!isIndexable(CT)) {
return null;
}
- if ( !isStemmable( CT ) ) {
- return CT ;
+ if (!isStemmable(CT)) {
+ return CT;
}
- R1 = getR1(CT) ;
- R2 = getR1(R1) ;
- RV = getRV(CT) ;
- TERM = term + ";" +CT ;
+ R1 = getR1(CT);
+ R2 = getR1(R1);
+ RV = getRV(CT);
+ TERM = term + ";" + CT;
- altered = step1() ;
+ altered = step1();
if (!altered) {
- altered = step2() ;
+ altered = step2();
}
if (altered) {
@@ -73,20 +67,20 @@ public class BrazilianStemmer {
step4();
}
- step5() ;
+ step5();
- return CT ;
+ return CT;
}
/**
* Checks a term if it can be processed correctly.
*
- * @return true if, and only if, the given term consists in letters.
+ * @return true if, and only if, the given term consists only of letters.
*/
- private boolean isStemmable( String term ) {
- for ( int c = 0; c < term.length(); c++ ) {
+ private boolean isStemmable(String term) {
+ for (int c = 0; c < term.length(); c++) {
// Discard terms that contain non-letter characters.
- if ( !Character.isLetter(term.charAt(c))) {
+ if (!Character.isLetter(term.charAt(c))) {
return false;
}
}
@@ -96,10 +90,10 @@ public class BrazilianStemmer {
/**
* Checks whether a term can be indexed.
*
- * @return true if it can be indexed
+ * @return true if it can be indexed
*/
- private boolean isIndexable( String term ) {
- return (term.length() < 30) && (term.length() > 2) ;
+ private boolean isIndexable(String term) {
+ return (term.length() < 30) && (term.length() > 2);
}
/**
@@ -107,179 +101,165 @@ public class BrazilianStemmer {
*
* @return true if is vowel
*/
- private boolean isVowel( char value ) {
- return (value == 'a') ||
- (value == 'e') ||
- (value == 'i') ||
- (value == 'o') ||
- (value == 'u') ;
+ private boolean isVowel(char value) {
+ return (value == 'a') || (value == 'e') || (value == 'i') || (value == 'o') || (value == 'u');
}
/**
* Gets R1
*
- * R1 - is the region after the first non-vowel following a vowel,
- * or is the null region at the end of the word if there is
- * no such non-vowel.
+ * <p>R1 - is the region after the first non-vowel following a vowel, or is the null region at the
+ * end of the word if there is no such non-vowel.
*
* @return null or a string representing R1
*/
- private String getR1( String value ) {
- int i;
- int j;
+ private String getR1(String value) {
+ int i;
+ int j;
// be-safe !!!
if (value == null) {
- return null ;
+ return null;
}
// find 1st vowel
- i = value.length()-1 ;
- for (j=0 ; j < i ; j++) {
+ i = value.length() - 1;
+ for (j = 0; j < i; j++) {
if (isVowel(value.charAt(j))) {
- break ;
+ break;
}
}
if (!(j < i)) {
- return null ;
+ return null;
}
// find 1st non-vowel
- for ( ; j < i ; j++) {
+ for (; j < i; j++) {
if (!(isVowel(value.charAt(j)))) {
- break ;
+ break;
}
}
if (!(j < i)) {
- return null ;
+ return null;
}
- return value.substring(j+1) ;
+ return value.substring(j + 1);
}
/**
* Gets RV
*
- * RV - IF the second letter is a consonant, RV is the region after
- * the next following vowel,
+ * <p>RV - IF the second letter is a consonant, RV is the region after the next following vowel,
*
- * OR if the first two letters are vowels, RV is the region
- * after the next consonant,
+ * <p>OR if the first two letters are vowels, RV is the region after the next consonant,
*
- * AND otherwise (consonant-vowel case) RV is the region after
- * the third letter.
+ * <p>AND otherwise (consonant-vowel case) RV is the region after the third letter.
*
- * BUT RV is the end of the word if this positions cannot be
- * found.
+ * <p>BUT RV is the end of the word if these positions cannot be found.
*
* @return null or a string representing RV
*/
- private String getRV( String value ) {
- int i;
- int j;
+ private String getRV(String value) {
+ int i;
+ int j;
// be-safe !!!
if (value == null) {
- return null ;
+ return null;
}
- i = value.length()-1 ;
+ i = value.length() - 1;
// RV - IF the second letter is a consonant, RV is the region after
// the next following vowel,
if ((i > 0) && !isVowel(value.charAt(1))) {
// find 1st vowel
- for (j=2 ; j < i ; j++) {
+ for (j = 2; j < i; j++) {
if (isVowel(value.charAt(j))) {
- break ;
+ break;
}
}
if (j < i) {
- return value.substring(j+1) ;
+ return value.substring(j + 1);
}
}
-
// RV - OR if the first two letters are vowels, RV is the region
// after the next consonant,
- if ((i > 1) &&
- isVowel(value.charAt(0)) &&
- isVowel(value.charAt(1))) {
+ if ((i > 1) && isVowel(value.charAt(0)) && isVowel(value.charAt(1))) {
+ // find 1st consonant
- for (j=2 ; j < i ; j++) {
+ for (j = 2; j < i; j++) {
if (!isVowel(value.charAt(j))) {
- break ;
+ break;
}
}
if (j < i) {
- return value.substring(j+1) ;
+ return value.substring(j + 1);
}
}
// RV - AND otherwise (consonant-vowel case) RV is the region after
// the third letter.
if (i > 2) {
- return value.substring(3) ;
+ return value.substring(3);
}
- return null ;
+ return null;
}
/**
- * 1) Turn to lowercase
- * 2) Remove accents
- * 3) ã -> a ; õ -> o
- * 4) ç -> c
+ * 1) Turn to lowercase 2) Remove accents 3) ã -> a ; õ -> o 4) ç -> c
*
* @return null or a string transformed
*/
- private String changeTerm( String value ) {
- int j;
- String r = "" ;
+ private String changeTerm(String value) {
+ int j;
+ String r = "";
// be-safe !!!
if (value == null) {
- return null ;
+ return null;
}
- value = value.toLowerCase(locale) ;
- for (j=0 ; j < value.length() ; j++) {
- if ((value.charAt(j) == 'á') ||
- (value.charAt(j) == 'â') ||
- (value.charAt(j) == 'ã')) {
- r= r + "a" ; continue ;
+ value = value.toLowerCase(locale);
+ for (j = 0; j < value.length(); j++) {
+ if ((value.charAt(j) == 'á') || (value.charAt(j) == 'â') || (value.charAt(j) == 'ã')) {
+ r = r + "a";
+ continue;
}
- if ((value.charAt(j) == 'é') ||
- (value.charAt(j) == 'ê')) {
- r= r + "e" ; continue ;
+ if ((value.charAt(j) == 'é') || (value.charAt(j) == 'ê')) {
+ r = r + "e";
+ continue;
}
if (value.charAt(j) == 'í') {
- r= r + "i" ; continue ;
+ r = r + "i";
+ continue;
}
- if ((value.charAt(j) == 'ó') ||
- (value.charAt(j) == 'ô') ||
- (value.charAt(j) == 'õ')) {
- r= r + "o" ; continue ;
+ if ((value.charAt(j) == 'ó') || (value.charAt(j) == 'ô') || (value.charAt(j) == 'õ')) {
+ r = r + "o";
+ continue;
}
- if ((value.charAt(j) == 'ú') ||
- (value.charAt(j) == 'ü')) {
- r= r + "u" ; continue ;
+ if ((value.charAt(j) == 'ú') || (value.charAt(j) == 'ü')) {
+ r = r + "u";
+ continue;
}
if (value.charAt(j) == 'ç') {
- r= r + "c" ; continue ;
+ r = r + "c";
+ continue;
}
if (value.charAt(j) == 'ñ') {
- r= r + "n" ; continue ;
+ r = r + "n";
+ continue;
}
- r= r+ value.charAt(j) ;
+ r = r + value.charAt(j);
}
- return r ;
+ return r;
}
/**
@@ -287,18 +267,18 @@ public class BrazilianStemmer {
*
* @return true if the string ends with the specified suffix
*/
- private boolean suffix( String value, String suffix ) {
+ private boolean suffix(String value, String suffix) {
// be-safe !!!
if ((value == null) || (suffix == null)) {
- return false ;
+ return false;
}
if (suffix.length() > value.length()) {
- return false ;
+ return false;
}
- return value.substring(value.length()-suffix.length()).equals(suffix);
+ return value.substring(value.length() - suffix.length()).equals(suffix);
}
/**
@@ -306,22 +286,20 @@ public class BrazilianStemmer {
*
* @return the replaced String
*/
- private String replaceSuffix( String value, String toReplace, String changeTo ) {
- String vvalue ;
+ private String replaceSuffix(String value, String toReplace, String changeTo) {
+ String vvalue;
// be-safe !!!
- if ((value == null) ||
- (toReplace == null) ||
- (changeTo == null) ) {
- return value ;
+ if ((value == null) || (toReplace == null) || (changeTo == null)) {
+ return value;
}
- vvalue = removeSuffix(value,toReplace) ;
+ vvalue = removeSuffix(value, toReplace);
if (value.equals(vvalue)) {
- return value ;
+ return value;
} else {
- return vvalue + changeTo ;
+ return vvalue + changeTo;
}
}
@@ -330,15 +308,13 @@ public class BrazilianStemmer {
*
* @return the String without the suffix
*/
- private String removeSuffix( String value, String toRemove ) {
+ private String removeSuffix(String value, String toRemove) {
// be-safe !!!
- if ((value == null) ||
- (toRemove == null) ||
- !suffix(value,toRemove) ) {
- return value ;
+ if ((value == null) || (toRemove == null) || !suffix(value, toRemove)) {
+ return value;
}
- return value.substring(0,value.length()-toRemove.length()) ;
+ return value.substring(0, value.length() - toRemove.length());
}
/**
@@ -346,679 +322,823 @@ public class BrazilianStemmer {
*
* @return true if the suffix is preceded
*/
- private boolean suffixPreceded( String value, String suffix, String preceded ) {
+ private boolean suffixPreceded(String value, String suffix, String preceded) {
// be-safe !!!
- if ((value == null) ||
- (suffix == null) ||
- (preceded == null) ||
- !suffix(value,suffix) ) {
- return false ;
+ if ((value == null) || (suffix == null) || (preceded == null) || !suffix(value, suffix)) {
+ return false;
}
- return suffix(removeSuffix(value,suffix),preceded) ;
+ return suffix(removeSuffix(value, suffix), preceded);
}
- /**
- * Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'.
- */
- private void createCT( String term ) {
- CT = changeTerm(term) ;
+ /** Creates CT (changed term), substituting 'ã' and 'õ' for 'a~' and 'o~'. */
+ private void createCT(String term) {
+ CT = changeTerm(term);
- if (CT.length() < 2) return ;
+ if (CT.length() < 2) return;
// if the first character is ... , remove it
- if ((CT.charAt(0) == '"') ||
- (CT.charAt(0) == '\'') ||
- (CT.charAt(0) == '-') ||
- (CT.charAt(0) == ',') ||
- (CT.charAt(0) == ';') ||
- (CT.charAt(0) == '.') ||
- (CT.charAt(0) == '?') ||
- (CT.charAt(0) == '!')
- ) {
- CT = CT.substring(1);
+ if ((CT.charAt(0) == '"')
+ || (CT.charAt(0) == '\'')
+ || (CT.charAt(0) == '-')
+ || (CT.charAt(0) == ',')
+ || (CT.charAt(0) == ';')
+ || (CT.charAt(0) == '.')
+ || (CT.charAt(0) == '?')
+ || (CT.charAt(0) == '!')) {
+ CT = CT.substring(1);
}
- if (CT.length() < 2) return ;
+ if (CT.length() < 2) return;
// if the last character is ... , remove it
- if ((CT.charAt(CT.length()-1) == '-') ||
- (CT.charAt(CT.length()-1) == ',') ||
- (CT.charAt(CT.length()-1) == ';') ||
- (CT.charAt(CT.length()-1) == '.') ||
- (CT.charAt(CT.length()-1) == '?') ||
- (CT.charAt(CT.length()-1) == '!') ||
- (CT.charAt(CT.length()-1) == '\'') ||
- (CT.charAt(CT.length()-1) == '"')
- ) {
- CT = CT.substring(0,CT.length()-1);
+ if ((CT.charAt(CT.length() - 1) == '-')
+ || (CT.charAt(CT.length() - 1) == ',')
+ || (CT.charAt(CT.length() - 1) == ';')
+ || (CT.charAt(CT.length() - 1) == '.')
+ || (CT.charAt(CT.length() - 1) == '?')
+ || (CT.charAt(CT.length() - 1) == '!')
+ || (CT.charAt(CT.length() - 1) == '\'')
+ || (CT.charAt(CT.length() - 1) == '"')) {
+ CT = CT.substring(0, CT.length() - 1);
}
}
-
/**
- * Standard suffix removal.
- * Search for the longest among the following suffixes, and perform
- * the following actions:
+ * Standard suffix removal. Search for the longest among the following suffixes, and perform the
+ * following actions:
*
* @return false if no ending was removed
*/
private boolean step1() {
- if (CT == null) return false ;
+ if (CT == null) return false;
// suffix length = 7
- if (suffix(CT,"uciones") && suffix(R2,"uciones")) {
- CT = replaceSuffix(CT,"uciones","u") ; return true;
+ if (suffix(CT, "uciones") && suffix(R2, "uciones")) {
+ CT = replaceSuffix(CT, "uciones", "u");
+ return true;
}
// suffix length = 6
if (CT.length() >= 6) {
- if (suffix(CT,"imentos") && suffix(R2,"imentos")) {
- CT = removeSuffix(CT,"imentos") ; return true;
+ if (suffix(CT, "imentos") && suffix(R2, "imentos")) {
+ CT = removeSuffix(CT, "imentos");
+ return true;
}
- if (suffix(CT,"amentos") && suffix(R2,"amentos")) {
- CT = removeSuffix(CT,"amentos") ; return true;
+ if (suffix(CT, "amentos") && suffix(R2, "amentos")) {
+ CT = removeSuffix(CT, "amentos");
+ return true;
}
- if (suffix(CT,"adores") && suffix(R2,"adores")) {
- CT = removeSuffix(CT,"adores") ; return true;
+ if (suffix(CT, "adores") && suffix(R2, "adores")) {
+ CT = removeSuffix(CT, "adores");
+ return true;
}
- if (suffix(CT,"adoras") && suffix(R2,"adoras")) {
- CT = removeSuffix(CT,"adoras") ; return true;
+ if (suffix(CT, "adoras") && suffix(R2, "adoras")) {
+ CT = removeSuffix(CT, "adoras");
+ return true;
}
- if (suffix(CT,"logias") && suffix(R2,"logias")) {
- replaceSuffix(CT,"logias","log") ; return true;
+ if (suffix(CT, "logias") && suffix(R2, "logias")) {
+ replaceSuffix(CT, "logias", "log");
+ return true;
}
- if (suffix(CT,"encias") && suffix(R2,"encias")) {
- CT = replaceSuffix(CT,"encias","ente") ; return true;
+ if (suffix(CT, "encias") && suffix(R2, "encias")) {
+ CT = replaceSuffix(CT, "encias", "ente");
+ return true;
}
- if (suffix(CT,"amente") && suffix(R1,"amente")) {
- CT = removeSuffix(CT,"amente") ; return true;
+ if (suffix(CT, "amente") && suffix(R1, "amente")) {
+ CT = removeSuffix(CT, "amente");
+ return true;
}
- if (suffix(CT,"idades") && suffix(R2,"idades")) {
- CT = removeSuffix(CT,"idades") ; return true;
+ if (suffix(CT, "idades") && suffix(R2, "idades")) {
+ CT = removeSuffix(CT, "idades");
+ return true;
}
}
// suffix length = 5
if (CT.length() >= 5) {
- if (suffix(CT,"acoes") && suffix(R2,"acoes")) {
- CT = removeSuffix(CT,"acoes") ; return true;
+ if (suffix(CT, "acoes") && suffix(R2, "acoes")) {
+ CT = removeSuffix(CT, "acoes");
+ return true;
}
- if (suffix(CT,"imento") && suffix(R2,"imento")) {
- CT = removeSuffix(CT,"imento") ; return true;
+ if (suffix(CT, "imento") && suffix(R2, "imento")) {
+ CT = removeSuffix(CT, "imento");
+ return true;
}
- if (suffix(CT,"amento") && suffix(R2,"amento")) {
- CT = removeSuffix(CT,"amento") ; return true;
+ if (suffix(CT, "amento") && suffix(R2, "amento")) {
+ CT = removeSuffix(CT, "amento");
+ return true;
}
- if (suffix(CT,"adora") && suffix(R2,"adora")) {
- CT = removeSuffix(CT,"adora") ; return true;
+ if (suffix(CT, "adora") && suffix(R2, "adora")) {
+ CT = removeSuffix(CT, "adora");
+ return true;
}
- if (suffix(CT,"ismos") && suffix(R2,"ismos")) {
- CT = removeSuffix(CT,"ismos") ; return true;
+ if (suffix(CT, "ismos") && suffix(R2, "ismos")) {
+ CT = removeSuffix(CT, "ismos");
+ return true;
}
- if (suffix(CT,"istas") && suffix(R2,"istas")) {
- CT = removeSuffix(CT,"istas") ; return true;
+ if (suffix(CT, "istas") && suffix(R2, "istas")) {
+ CT = removeSuffix(CT, "istas");
+ return true;
}
- if (suffix(CT,"logia") && suffix(R2,"logia")) {
- CT = replaceSuffix(CT,"logia","log") ; return true;
+ if (suffix(CT, "logia") && suffix(R2, "logia")) {
+ CT = replaceSuffix(CT, "logia", "log");
+ return true;
}
- if (suffix(CT,"ucion") && suffix(R2,"ucion")) {
- CT = replaceSuffix(CT,"ucion","u") ; return true;
+ if (suffix(CT, "ucion") && suffix(R2, "ucion")) {
+ CT = replaceSuffix(CT, "ucion", "u");
+ return true;
}
- if (suffix(CT,"encia") && suffix(R2,"encia")) {
- CT = replaceSuffix(CT,"encia","ente") ; return true;
+ if (suffix(CT, "encia") && suffix(R2, "encia")) {
+ CT = replaceSuffix(CT, "encia", "ente");
+ return true;
}
- if (suffix(CT,"mente") && suffix(R2,"mente")) {
- CT = removeSuffix(CT,"mente") ; return true;
+ if (suffix(CT, "mente") && suffix(R2, "mente")) {
+ CT = removeSuffix(CT, "mente");
+ return true;
}
- if (suffix(CT,"idade") && suffix(R2,"idade")) {
- CT = removeSuffix(CT,"idade") ; return true;
+ if (suffix(CT, "idade") && suffix(R2, "idade")) {
+ CT = removeSuffix(CT, "idade");
+ return true;
}
}
// suffix length = 4
if (CT.length() >= 4) {
- if (suffix(CT,"acao") && suffix(R2,"acao")) {
- CT = removeSuffix(CT,"acao") ; return true;
+ if (suffix(CT, "acao") && suffix(R2, "acao")) {
+ CT = removeSuffix(CT, "acao");
+ return true;
}
- if (suffix(CT,"ezas") && suffix(R2,"ezas")) {
- CT = removeSuffix(CT,"ezas") ; return true;
+ if (suffix(CT, "ezas") && suffix(R2, "ezas")) {
+ CT = removeSuffix(CT, "ezas");
+ return true;
}
- if (suffix(CT,"icos") && suffix(R2,"icos")) {
- CT = removeSuffix(CT,"icos") ; return true ;
+ if (suffix(CT, "icos") && suffix(R2, "icos")) {
+ CT = removeSuffix(CT, "icos");
+ return true;
}
- if (suffix(CT,"icas") && suffix(R2,"icas")) {
- CT = removeSuffix(CT,"icas") ; return true ;
+ if (suffix(CT, "icas") && suffix(R2, "icas")) {
+ CT = removeSuffix(CT, "icas");
+ return true;
}
- if (suffix(CT,"ismo") && suffix(R2,"ismo")) {
- CT = removeSuffix(CT,"ismo") ; return true ;
+ if (suffix(CT, "ismo") && suffix(R2, "ismo")) {
+ CT = removeSuffix(CT, "ismo");
+ return true;
}
- if (suffix(CT,"avel") && suffix(R2,"avel")) {
- CT = removeSuffix(CT,"avel") ; return true ;
+ if (suffix(CT, "avel") && suffix(R2, "avel")) {
+ CT = removeSuffix(CT, "avel");
+ return true;
}
- if (suffix(CT,"ivel") && suffix(R2,"ivel")) {
- CT = removeSuffix(CT,"ivel") ; return true ;
+ if (suffix(CT, "ivel") && suffix(R2, "ivel")) {
+ CT = removeSuffix(CT, "ivel");
+ return true;
}
- if (suffix(CT,"ista") && suffix(R2,"ista")) {
- CT = removeSuffix(CT,"ista") ; return true ;
+ if (suffix(CT, "ista") && suffix(R2, "ista")) {
+ CT = removeSuffix(CT, "ista");
+ return true;
}
- if (suffix(CT,"osos") && suffix(R2,"osos")) {
- CT = removeSuffix(CT,"osos") ; return true ;
+ if (suffix(CT, "osos") && suffix(R2, "osos")) {
+ CT = removeSuffix(CT, "osos");
+ return true;
}
- if (suffix(CT,"osas") && suffix(R2,"osas")) {
- CT = removeSuffix(CT,"osas") ; return true ;
+ if (suffix(CT, "osas") && suffix(R2, "osas")) {
+ CT = removeSuffix(CT, "osas");
+ return true;
}
- if (suffix(CT,"ador") && suffix(R2,"ador")) {
- CT = removeSuffix(CT,"ador") ; return true ;
+ if (suffix(CT, "ador") && suffix(R2, "ador")) {
+ CT = removeSuffix(CT, "ador");
+ return true;
}
- if (suffix(CT,"ivas") && suffix(R2,"ivas")) {
- CT = removeSuffix(CT,"ivas") ; return true ;
+ if (suffix(CT, "ivas") && suffix(R2, "ivas")) {
+ CT = removeSuffix(CT, "ivas");
+ return true;
}
- if (suffix(CT,"ivos") && suffix(R2,"ivos")) {
- CT = removeSuffix(CT,"ivos") ; return true ;
+ if (suffix(CT, "ivos") && suffix(R2, "ivos")) {
+ CT = removeSuffix(CT, "ivos");
+ return true;
}
- if (suffix(CT,"iras") &&
- suffix(RV,"iras") &&
- suffixPreceded(CT,"iras","e")) {
- CT = replaceSuffix(CT,"iras","ir") ; return true ;
+ if (suffix(CT, "iras") && suffix(RV, "iras") && suffixPreceded(CT, "iras", "e")) {
+ CT = replaceSuffix(CT, "iras", "ir");
+ return true;
}
}
// suffix length = 3
if (CT.length() >= 3) {
- if (suffix(CT,"eza") && suffix(R2,"eza")) {
- CT = removeSuffix(CT,"eza") ; return true ;
+ if (suffix(CT, "eza") && suffix(R2, "eza")) {
+ CT = removeSuffix(CT, "eza");
+ return true;
}
- if (suffix(CT,"ico") && suffix(R2,"ico")) {
- CT = removeSuffix(CT,"ico") ; return true ;
+ if (suffix(CT, "ico") && suffix(R2, "ico")) {
+ CT = removeSuffix(CT, "ico");
+ return true;
}
- if (suffix(CT,"ica") && suffix(R2,"ica")) {
- CT = removeSuffix(CT,"ica") ; return true ;
+ if (suffix(CT, "ica") && suffix(R2, "ica")) {
+ CT = removeSuffix(CT, "ica");
+ return true;
}
- if (suffix(CT,"oso") && suffix(R2,"oso")) {
- CT = removeSuffix(CT,"oso") ; return true ;
+ if (suffix(CT, "oso") && suffix(R2, "oso")) {
+ CT = removeSuffix(CT, "oso");
+ return true;
}
- if (suffix(CT,"osa") && suffix(R2,"osa")) {
- CT = removeSuffix(CT,"osa") ; return true ;
+ if (suffix(CT, "osa") && suffix(R2, "osa")) {
+ CT = removeSuffix(CT, "osa");
+ return true;
}
- if (suffix(CT,"iva") && suffix(R2,"iva")) {
- CT = removeSuffix(CT,"iva") ; return true ;
+ if (suffix(CT, "iva") && suffix(R2, "iva")) {
+ CT = removeSuffix(CT, "iva");
+ return true;
}
- if (suffix(CT,"ivo") && suffix(R2,"ivo")) {
- CT = removeSuffix(CT,"ivo") ; return true ;
+ if (suffix(CT, "ivo") && suffix(R2, "ivo")) {
+ CT = removeSuffix(CT, "ivo");
+ return true;
}
- if (suffix(CT,"ira") &&
- suffix(RV,"ira") &&
- suffixPreceded(CT,"ira","e")) {
- CT = replaceSuffix(CT,"ira","ir") ; return true ;
+ if (suffix(CT, "ira") && suffix(RV, "ira") && suffixPreceded(CT, "ira", "e")) {
+ CT = replaceSuffix(CT, "ira", "ir");
+ return true;
}
}
// no ending was removed by step1
- return false ;
+ return false;
}
-
/**
* Verb suffixes.
*
- * Search for the longest among the following suffixes in RV,
- * and if found, delete.
+ * <p>Search for the longest among the following suffixes in RV, and if found, delete.
*
* @return false if no ending was removed
- */
+ */
private boolean step2() {
- if (RV == null) return false ;
+ if (RV == null) return false;
// suffix lenght = 7
if (RV.length() >= 7) {
- if (suffix(RV,"issemos")) {
- CT = removeSuffix(CT,"issemos") ; return true;
+ if (suffix(RV, "issemos")) {
+ CT = removeSuffix(CT, "issemos");
+ return true;
}
- if (suffix(RV,"essemos")) {
- CT = removeSuffix(CT,"essemos") ; return true;
+ if (suffix(RV, "essemos")) {
+ CT = removeSuffix(CT, "essemos");
+ return true;
}
- if (suffix(RV,"assemos")) {
- CT = removeSuffix(CT,"assemos") ; return true;
+ if (suffix(RV, "assemos")) {
+ CT = removeSuffix(CT, "assemos");
+ return true;
}
- if (suffix(RV,"ariamos")) {
- CT = removeSuffix(CT,"ariamos") ; return true;
+ if (suffix(RV, "ariamos")) {
+ CT = removeSuffix(CT, "ariamos");
+ return true;
}
- if (suffix(RV,"eriamos")) {
- CT = removeSuffix(CT,"eriamos") ; return true;
+ if (suffix(RV, "eriamos")) {
+ CT = removeSuffix(CT, "eriamos");
+ return true;
}
- if (suffix(RV,"iriamos")) {
- CT = removeSuffix(CT,"iriamos") ; return true;
+ if (suffix(RV, "iriamos")) {
+ CT = removeSuffix(CT, "iriamos");
+ return true;
}
}
// suffix length = 6
if (RV.length() >= 6) {
- if (suffix(RV,"iremos")) {
- CT = removeSuffix(CT,"iremos") ; return true;
+ if (suffix(RV, "iremos")) {
+ CT = removeSuffix(CT, "iremos");
+ return true;
}
- if (suffix(RV,"eremos")) {
- CT = removeSuffix(CT,"eremos") ; return true;
+ if (suffix(RV, "eremos")) {
+ CT = removeSuffix(CT, "eremos");
+ return true;
}
- if (suffix(RV,"aremos")) {
- CT = removeSuffix(CT,"aremos") ; return true;
+ if (suffix(RV, "aremos")) {
+ CT = removeSuffix(CT, "aremos");
+ return true;
}
- if (suffix(RV,"avamos")) {
- CT = removeSuffix(CT,"avamos") ; return true;
+ if (suffix(RV, "avamos")) {
+ CT = removeSuffix(CT, "avamos");
+ return true;
}
- if (suffix(RV,"iramos")) {
- CT = removeSuffix(CT,"iramos") ; return true;
+ if (suffix(RV, "iramos")) {
+ CT = removeSuffix(CT, "iramos");
+ return true;
}
- if (suffix(RV,"eramos")) {
- CT = removeSuffix(CT,"eramos") ; return true;
+ if (suffix(RV, "eramos")) {
+ CT = removeSuffix(CT, "eramos");
+ return true;
}
- if (suffix(RV,"aramos")) {
- CT = removeSuffix(CT,"aramos") ; return true;
+ if (suffix(RV, "aramos")) {
+ CT = removeSuffix(CT, "aramos");
+ return true;
}
- if (suffix(RV,"asseis")) {
- CT = removeSuffix(CT,"asseis") ; return true;
+ if (suffix(RV, "asseis")) {
+ CT = removeSuffix(CT, "asseis");
+ return true;
}
- if (suffix(RV,"esseis")) {
- CT = removeSuffix(CT,"esseis") ; return true;
+ if (suffix(RV, "esseis")) {
+ CT = removeSuffix(CT, "esseis");
+ return true;
}
- if (suffix(RV,"isseis")) {
- CT = removeSuffix(CT,"isseis") ; return true;
+ if (suffix(RV, "isseis")) {
+ CT = removeSuffix(CT, "isseis");
+ return true;
}
- if (suffix(RV,"arieis")) {
- CT = removeSuffix(CT,"arieis") ; return true;
+ if (suffix(RV, "arieis")) {
+ CT = removeSuffix(CT, "arieis");
+ return true;
}
- if (suffix(RV,"erieis")) {
- CT = removeSuffix(CT,"erieis") ; return true;
+ if (suffix(RV, "erieis")) {
+ CT = removeSuffix(CT, "erieis");
+ return true;
}
- if (suffix(RV,"irieis")) {
- CT = removeSuffix(CT,"irieis") ; return true;
+ if (suffix(RV, "irieis")) {
+ CT = removeSuffix(CT, "irieis");
+ return true;
}
}
-
// suffix length = 5
if (RV.length() >= 5) {
- if (suffix(RV,"irmos")) {
- CT = removeSuffix(CT,"irmos") ; return true;
+ if (suffix(RV, "irmos")) {
+ CT = removeSuffix(CT, "irmos");
+ return true;
}
- if (suffix(RV,"iamos")) {
- CT = removeSuffix(CT,"iamos") ; return true;
+ if (suffix(RV, "iamos")) {
+ CT = removeSuffix(CT, "iamos");
+ return true;
}
- if (suffix(RV,"armos")) {
- CT = removeSuffix(CT,"armos") ; return true;
+ if (suffix(RV, "armos")) {
+ CT = removeSuffix(CT, "armos");
+ return true;
}
- if (suffix(RV,"ermos")) {
- CT = removeSuffix(CT,"ermos") ; return true;
+ if (suffix(RV, "ermos")) {
+ CT = removeSuffix(CT, "ermos");
+ return true;
}
- if (suffix(RV,"areis")) {
- CT = removeSuffix(CT,"areis") ; return true;
+ if (suffix(RV, "areis")) {
+ CT = removeSuffix(CT, "areis");
+ return true;
}
- if (suffix(RV,"ereis")) {
- CT = removeSuffix(CT,"ereis") ; return true;
+ if (suffix(RV, "ereis")) {
+ CT = removeSuffix(CT, "ereis");
+ return true;
}
- if (suffix(RV,"ireis")) {
- CT = removeSuffix(CT,"ireis") ; return true;
+ if (suffix(RV, "ireis")) {
+ CT = removeSuffix(CT, "ireis");
+ return true;
}
- if (suffix(RV,"asses")) {
- CT = removeSuffix(CT,"asses") ; return true;
+ if (suffix(RV, "asses")) {
+ CT = removeSuffix(CT, "asses");
+ return true;
}
- if (suffix(RV,"esses")) {
- CT = removeSuffix(CT,"esses") ; return true;
+ if (suffix(RV, "esses")) {
+ CT = removeSuffix(CT, "esses");
+ return true;
}
- if (suffix(RV,"isses")) {
- CT = removeSuffix(CT,"isses") ; return true;
+ if (suffix(RV, "isses")) {
+ CT = removeSuffix(CT, "isses");
+ return true;
}
- if (suffix(RV,"astes")) {
- CT = removeSuffix(CT,"astes") ; return true;
+ if (suffix(RV, "astes")) {
+ CT = removeSuffix(CT, "astes");
+ return true;
}
- if (suffix(RV,"assem")) {
- CT = removeSuffix(CT,"assem") ; return true;
+ if (suffix(RV, "assem")) {
+ CT = removeSuffix(CT, "assem");
+ return true;
}
- if (suffix(RV,"essem")) {
- CT = removeSuffix(CT,"essem") ; return true;
+ if (suffix(RV, "essem")) {
+ CT = removeSuffix(CT, "essem");
+ return true;
}
- if (suffix(RV,"issem")) {
- CT = removeSuffix(CT,"issem") ; return true;
+ if (suffix(RV, "issem")) {
+ CT = removeSuffix(CT, "issem");
+ return true;
}
- if (suffix(RV,"ardes")) {
- CT = removeSuffix(CT,"ardes") ; return true;
+ if (suffix(RV, "ardes")) {
+ CT = removeSuffix(CT, "ardes");
+ return true;
}
- if (suffix(RV,"erdes")) {
- CT = removeSuffix(CT,"erdes") ; return true;
+ if (suffix(RV, "erdes")) {
+ CT = removeSuffix(CT, "erdes");
+ return true;
}
- if (suffix(RV,"irdes")) {
- CT = removeSuffix(CT,"irdes") ; return true;
+ if (suffix(RV, "irdes")) {
+ CT = removeSuffix(CT, "irdes");
+ return true;
}
- if (suffix(RV,"ariam")) {
- CT = removeSuffix(CT,"ariam") ; return true;
+ if (suffix(RV, "ariam")) {
+ CT = removeSuffix(CT, "ariam");
+ return true;
}
- if (suffix(RV,"eriam")) {
- CT = removeSuffix(CT,"eriam") ; return true;
+ if (suffix(RV, "eriam")) {
+ CT = removeSuffix(CT, "eriam");
+ return true;
}
- if (suffix(RV,"iriam")) {
- CT = removeSuffix(CT,"iriam") ; return true;
+ if (suffix(RV, "iriam")) {
+ CT = removeSuffix(CT, "iriam");
+ return true;
}
- if (suffix(RV,"arias")) {
- CT = removeSuffix(CT,"arias") ; return true;
+ if (suffix(RV, "arias")) {
+ CT = removeSuffix(CT, "arias");
+ return true;
}
- if (suffix(RV,"erias")) {
- CT = removeSuffix(CT,"erias") ; return true;
+ if (suffix(RV, "erias")) {
+ CT = removeSuffix(CT, "erias");
+ return true;
}
- if (suffix(RV,"irias")) {
- CT = removeSuffix(CT,"irias") ; return true;
+ if (suffix(RV, "irias")) {
+ CT = removeSuffix(CT, "irias");
+ return true;
}
- if (suffix(RV,"estes")) {
- CT = removeSuffix(CT,"estes") ; return true;
+ if (suffix(RV, "estes")) {
+ CT = removeSuffix(CT, "estes");
+ return true;
}
- if (suffix(RV,"istes")) {
- CT = removeSuffix(CT,"istes") ; return true;
+ if (suffix(RV, "istes")) {
+ CT = removeSuffix(CT, "istes");
+ return true;
}
- if (suffix(RV,"areis")) {
- CT = removeSuffix(CT,"areis") ; return true;
+ if (suffix(RV, "areis")) {
+ CT = removeSuffix(CT, "areis");
+ return true;
}
- if (suffix(RV,"aveis")) {
- CT = removeSuffix(CT,"aveis") ; return true;
+ if (suffix(RV, "aveis")) {
+ CT = removeSuffix(CT, "aveis");
+ return true;
}
}
// suffix length = 4
if (RV.length() >= 4) {
- if (suffix(RV,"aria")) {
- CT = removeSuffix(CT,"aria") ; return true;
+ if (suffix(RV, "aria")) {
+ CT = removeSuffix(CT, "aria");
+ return true;
}
- if (suffix(RV,"eria")) {
- CT = removeSuffix(CT,"eria") ; return true;
+ if (suffix(RV, "eria")) {
+ CT = removeSuffix(CT, "eria");
+ return true;
}
- if (suffix(RV,"iria")) {
- CT = removeSuffix(CT,"iria") ; return true;
+ if (suffix(RV, "iria")) {
+ CT = removeSuffix(CT, "iria");
+ return true;
}
- if (suffix(RV,"asse")) {
- CT = removeSuffix(CT,"asse") ; return true;
+ if (suffix(RV, "asse")) {
+ CT = removeSuffix(CT, "asse");
+ return true;
}
- if (suffix(RV,"esse")) {
- CT = removeSuffix(CT,"esse") ; return true;
+ if (suffix(RV, "esse")) {
+ CT = removeSuffix(CT, "esse");
+ return true;
}
- if (suffix(RV,"isse")) {
- CT = removeSuffix(CT,"isse") ; return true;
+ if (suffix(RV, "isse")) {
+ CT = removeSuffix(CT, "isse");
+ return true;
}
- if (suffix(RV,"aste")) {
- CT = removeSuffix(CT,"aste") ; return true;
+ if (suffix(RV, "aste")) {
+ CT = removeSuffix(CT, "aste");
+ return true;
}
- if (suffix(RV,"este")) {
- CT = removeSuffix(CT,"este") ; return true;
+ if (suffix(RV, "este")) {
+ CT = removeSuffix(CT, "este");
+ return true;
}
- if (suffix(RV,"iste")) {
- CT = removeSuffix(CT,"iste") ; return true;
+ if (suffix(RV, "iste")) {
+ CT = removeSuffix(CT, "iste");
+ return true;
}
- if (suffix(RV,"arei")) {
- CT = removeSuffix(CT,"arei") ; return true;
+ if (suffix(RV, "arei")) {
+ CT = removeSuffix(CT, "arei");
+ return true;
}
- if (suffix(RV,"erei")) {
- CT = removeSuffix(CT,"erei") ; return true;
+ if (suffix(RV, "erei")) {
+ CT = removeSuffix(CT, "erei");
+ return true;
}
- if (suffix(RV,"irei")) {
- CT = removeSuffix(CT,"irei") ; return true;
+ if (suffix(RV, "irei")) {
+ CT = removeSuffix(CT, "irei");
+ return true;
}
- if (suffix(RV,"aram")) {
- CT = removeSuffix(CT,"aram") ; return true;
+ if (suffix(RV, "aram")) {
+ CT = removeSuffix(CT, "aram");
+ return true;
}
- if (suffix(RV,"eram")) {
- CT = removeSuffix(CT,"eram") ; return true;
+ if (suffix(RV, "eram")) {
+ CT = removeSuffix(CT, "eram");
+ return true;
}
- if (suffix(RV,"iram")) {
- CT = removeSuffix(CT,"iram") ; return true;
+ if (suffix(RV, "iram")) {
+ CT = removeSuffix(CT, "iram");
+ return true;
}
- if (suffix(RV,"avam")) {
- CT = removeSuffix(CT,"avam") ; return true;
+ if (suffix(RV, "avam")) {
+ CT = removeSuffix(CT, "avam");
+ return true;
}
- if (suffix(RV,"arem")) {
- CT = removeSuffix(CT,"arem") ; return true;
+ if (suffix(RV, "arem")) {
+ CT = removeSuffix(CT, "arem");
+ return true;
}
- if (suffix(RV,"erem")) {
- CT = removeSuffix(CT,"erem") ; return true;
+ if (suffix(RV, "erem")) {
+ CT = removeSuffix(CT, "erem");
+ return true;
}
- if (suffix(RV,"irem")) {
- CT = removeSuffix(CT,"irem") ; return true;
+ if (suffix(RV, "irem")) {
+ CT = removeSuffix(CT, "irem");
+ return true;
}
- if (suffix(RV,"ando")) {
- CT = removeSuffix(CT,"ando") ; return true;
+ if (suffix(RV, "ando")) {
+ CT = removeSuffix(CT, "ando");
+ return true;
}
- if (suffix(RV,"endo")) {
- CT = removeSuffix(CT,"endo") ; return true;
+ if (suffix(RV, "endo")) {
+ CT = removeSuffix(CT, "endo");
+ return true;
}
- if (suffix(RV,"indo")) {
- CT = removeSuffix(CT,"indo") ; return true;
+ if (suffix(RV, "indo")) {
+ CT = removeSuffix(CT, "indo");
+ return true;
}
- if (suffix(RV,"arao")) {
- CT = removeSuffix(CT,"arao") ; return true;
+ if (suffix(RV, "arao")) {
+ CT = removeSuffix(CT, "arao");
+ return true;
}
- if (suffix(RV,"erao")) {
- CT = removeSuffix(CT,"erao") ; return true;
+ if (suffix(RV, "erao")) {
+ CT = removeSuffix(CT, "erao");
+ return true;
}
- if (suffix(RV,"irao")) {
- CT = removeSuffix(CT,"irao") ; return true;
+ if (suffix(RV, "irao")) {
+ CT = removeSuffix(CT, "irao");
+ return true;
}
- if (suffix(RV,"adas")) {
- CT = removeSuffix(CT,"adas") ; return true;
+ if (suffix(RV, "adas")) {
+ CT = removeSuffix(CT, "adas");
+ return true;
}
- if (suffix(RV,"idas")) {
- CT = removeSuffix(CT,"idas") ; return true;
+ if (suffix(RV, "idas")) {
+ CT = removeSuffix(CT, "idas");
+ return true;
}
- if (suffix(RV,"aras")) {
- CT = removeSuffix(CT,"aras") ; return true;
+ if (suffix(RV, "aras")) {
+ CT = removeSuffix(CT, "aras");
+ return true;
}
- if (suffix(RV,"eras")) {
- CT = removeSuffix(CT,"eras") ; return true;
+ if (suffix(RV, "eras")) {
+ CT = removeSuffix(CT, "eras");
+ return true;
}
- if (suffix(RV,"iras")) {
- CT = removeSuffix(CT,"iras") ; return true;
+ if (suffix(RV, "iras")) {
+ CT = removeSuffix(CT, "iras");
+ return true;
}
- if (suffix(RV,"avas")) {
- CT = removeSuffix(CT,"avas") ; return true;
+ if (suffix(RV, "avas")) {
+ CT = removeSuffix(CT, "avas");
+ return true;
}
- if (suffix(RV,"ares")) {
- CT = removeSuffix(CT,"ares") ; return true;
+ if (suffix(RV, "ares")) {
+ CT = removeSuffix(CT, "ares");
+ return true;
}
- if (suffix(RV,"eres")) {
- CT = removeSuffix(CT,"eres") ; return true;
+ if (suffix(RV, "eres")) {
+ CT = removeSuffix(CT, "eres");
+ return true;
}
- if (suffix(RV,"ires")) {
- CT = removeSuffix(CT,"ires") ; return true;
+ if (suffix(RV, "ires")) {
+ CT = removeSuffix(CT, "ires");
+ return true;
}
- if (suffix(RV,"ados")) {
- CT = removeSuffix(CT,"ados") ; return true;
+ if (suffix(RV, "ados")) {
+ CT = removeSuffix(CT, "ados");
+ return true;
}
- if (suffix(RV,"idos")) {
- CT = removeSuffix(CT,"idos") ; return true;
+ if (suffix(RV, "idos")) {
+ CT = removeSuffix(CT, "idos");
+ return true;
}
- if (suffix(RV,"amos")) {
- CT = removeSuffix(CT,"amos") ; return true;
+ if (suffix(RV, "amos")) {
+ CT = removeSuffix(CT, "amos");
+ return true;
}
- if (suffix(RV,"emos")) {
- CT = removeSuffix(CT,"emos") ; return true;
+ if (suffix(RV, "emos")) {
+ CT = removeSuffix(CT, "emos");
+ return true;
}
- if (suffix(RV,"imos")) {
- CT = removeSuffix(CT,"imos") ; return true;
+ if (suffix(RV, "imos")) {
+ CT = removeSuffix(CT, "imos");
+ return true;
}
- if (suffix(RV,"iras")) {
- CT = removeSuffix(CT,"iras") ; return true;
+ if (suffix(RV, "iras")) {
+ CT = removeSuffix(CT, "iras");
+ return true;
}
- if (suffix(RV,"ieis")) {
- CT = removeSuffix(CT,"ieis") ; return true;
+ if (suffix(RV, "ieis")) {
+ CT = removeSuffix(CT, "ieis");
+ return true;
}
}
// suffix length = 3
if (RV.length() >= 3) {
- if (suffix(RV,"ada")) {
- CT = removeSuffix(CT,"ada") ; return true;
+ if (suffix(RV, "ada")) {
+ CT = removeSuffix(CT, "ada");
+ return true;
}
- if (suffix(RV,"ida")) {
- CT = removeSuffix(CT,"ida") ; return true;
+ if (suffix(RV, "ida")) {
+ CT = removeSuffix(CT, "ida");
+ return true;
}
- if (suffix(RV,"ara")) {
- CT = removeSuffix(CT,"ara") ; return true;
+ if (suffix(RV, "ara")) {
+ CT = removeSuffix(CT, "ara");
+ return true;
}
- if (suffix(RV,"era")) {
- CT = removeSuffix(CT,"era") ; return true;
+ if (suffix(RV, "era")) {
+ CT = removeSuffix(CT, "era");
+ return true;
}
- if (suffix(RV,"ira")) {
- CT = removeSuffix(CT,"ava") ; return true;
+ if (suffix(RV, "ira")) {
+ CT = removeSuffix(CT, "ava");
+ return true;
}
- if (suffix(RV,"iam")) {
- CT = removeSuffix(CT,"iam") ; return true;
+ if (suffix(RV, "iam")) {
+ CT = removeSuffix(CT, "iam");
+ return true;
}
- if (suffix(RV,"ado")) {
- CT = removeSuffix(CT,"ado") ; return true;
+ if (suffix(RV, "ado")) {
+ CT = removeSuffix(CT, "ado");
+ return true;
}
- if (suffix(RV,"ido")) {
- CT = removeSuffix(CT,"ido") ; return true;
+ if (suffix(RV, "ido")) {
+ CT = removeSuffix(CT, "ido");
+ return true;
}
- if (suffix(RV,"ias")) {
- CT = removeSuffix(CT,"ias") ; return true;
+ if (suffix(RV, "ias")) {
+ CT = removeSuffix(CT, "ias");
+ return true;
}
- if (suffix(RV,"ais")) {
- CT = removeSuffix(CT,"ais") ; return true;
+ if (suffix(RV, "ais")) {
+ CT = removeSuffix(CT, "ais");
+ return true;
}
- if (suffix(RV,"eis")) {
- CT = removeSuffix(CT,"eis") ; return true;
+ if (suffix(RV, "eis")) {
+ CT = removeSuffix(CT, "eis");
+ return true;
}
- if (suffix(RV,"ira")) {
- CT = removeSuffix(CT,"ira") ; return true;
+ if (suffix(RV, "ira")) {
+ CT = removeSuffix(CT, "ira");
+ return true;
}
- if (suffix(RV,"ear")) {
- CT = removeSuffix(CT,"ear") ; return true;
+ if (suffix(RV, "ear")) {
+ CT = removeSuffix(CT, "ear");
+ return true;
}
}
// suffix length = 2
if (RV.length() >= 2) {
- if (suffix(RV,"ia")) {
- CT = removeSuffix(CT,"ia") ; return true;
+ if (suffix(RV, "ia")) {
+ CT = removeSuffix(CT, "ia");
+ return true;
}
- if (suffix(RV,"ei")) {
- CT = removeSuffix(CT,"ei") ; return true;
+ if (suffix(RV, "ei")) {
+ CT = removeSuffix(CT, "ei");
+ return true;
}
- if (suffix(RV,"am")) {
- CT = removeSuffix(CT,"am") ; return true;
+ if (suffix(RV, "am")) {
+ CT = removeSuffix(CT, "am");
+ return true;
}
- if (suffix(RV,"em")) {
- CT = removeSuffix(CT,"em") ; return true;
+ if (suffix(RV, "em")) {
+ CT = removeSuffix(CT, "em");
+ return true;
}
- if (suffix(RV,"ar")) {
- CT = removeSuffix(CT,"ar") ; return true;
+ if (suffix(RV, "ar")) {
+ CT = removeSuffix(CT, "ar");
+ return true;
}
- if (suffix(RV,"er")) {
- CT = removeSuffix(CT,"er") ; return true;
+ if (suffix(RV, "er")) {
+ CT = removeSuffix(CT, "er");
+ return true;
}
- if (suffix(RV,"ir")) {
- CT = removeSuffix(CT,"ir") ; return true;
+ if (suffix(RV, "ir")) {
+ CT = removeSuffix(CT, "ir");
+ return true;
}
- if (suffix(RV,"as")) {
- CT = removeSuffix(CT,"as") ; return true;
+ if (suffix(RV, "as")) {
+ CT = removeSuffix(CT, "as");
+ return true;
}
- if (suffix(RV,"es")) {
- CT = removeSuffix(CT,"es") ; return true;
+ if (suffix(RV, "es")) {
+ CT = removeSuffix(CT, "es");
+ return true;
}
- if (suffix(RV,"is")) {
- CT = removeSuffix(CT,"is") ; return true;
+ if (suffix(RV, "is")) {
+ CT = removeSuffix(CT, "is");
+ return true;
}
- if (suffix(RV,"eu")) {
- CT = removeSuffix(CT,"eu") ; return true;
+ if (suffix(RV, "eu")) {
+ CT = removeSuffix(CT, "eu");
+ return true;
}
- if (suffix(RV,"iu")) {
- CT = removeSuffix(CT,"iu") ; return true;
+ if (suffix(RV, "iu")) {
+ CT = removeSuffix(CT, "iu");
+ return true;
}
- if (suffix(RV,"iu")) {
- CT = removeSuffix(CT,"iu") ; return true;
+ if (suffix(RV, "iu")) {
+ CT = removeSuffix(CT, "iu");
+ return true;
}
- if (suffix(RV,"ou")) {
- CT = removeSuffix(CT,"ou") ; return true;
+ if (suffix(RV, "ou")) {
+ CT = removeSuffix(CT, "ou");
+ return true;
}
}
// no ending was removed by step2
- return false ;
+ return false;
}
- /**
- * Delete suffix 'i' if in RV and preceded by 'c'
- *
- */
+ /** Delete suffix 'i' if in RV and preceded by 'c' */
private void step3() {
- if (RV == null) return ;
+ if (RV == null) return;
- if (suffix(RV,"i") && suffixPreceded(RV,"i","c")) {
- CT = removeSuffix(CT,"i") ;
+ if (suffix(RV, "i") && suffixPreceded(RV, "i", "c")) {
+ CT = removeSuffix(CT, "i");
}
-
}
/**
* Residual suffix
*
- * If the word ends with one of the suffixes (os a i o á í ó)
- * in RV, delete it
- *
- */
+ * <p>If the word ends with one of the suffixes (os a i o á í ó) in RV, delete it
+ */
private void step4() {
- if (RV == null) return ;
+ if (RV == null) return;
- if (suffix(RV,"os")) {
- CT = removeSuffix(CT,"os") ; return ;
+ if (suffix(RV, "os")) {
+ CT = removeSuffix(CT, "os");
+ return;
}
- if (suffix(RV,"a")) {
- CT = removeSuffix(CT,"a") ; return ;
+ if (suffix(RV, "a")) {
+ CT = removeSuffix(CT, "a");
+ return;
}
- if (suffix(RV,"i")) {
- CT = removeSuffix(CT,"i") ; return ;
+ if (suffix(RV, "i")) {
+ CT = removeSuffix(CT, "i");
+ return;
}
- if (suffix(RV,"o")) {
- CT = removeSuffix(CT,"o") ; return ;
+ if (suffix(RV, "o")) {
+ CT = removeSuffix(CT, "o");
+ return;
}
-
}
/**
- * If the word ends with one of ( e é ê) in RV,delete it,
- * and if preceded by 'gu' (or 'ci') with the 'u' (or 'i') in RV,
- * delete the 'u' (or 'i')
- *
- * Or if the word ends ç remove the cedilha
+ * If the word ends with one of ( e é ê) in RV,delete it, and if preceded by 'gu' (or 'ci') with
+ * the 'u' (or 'i') in RV, delete the 'u' (or 'i')
*
- */
+ * <p>Or if the word ends ç remove the cedilha
+ */
private void step5() {
- if (RV == null) return ;
+ if (RV == null) return;
- if (suffix(RV,"e")) {
- if (suffixPreceded(RV,"e","gu")) {
- CT = removeSuffix(CT,"e") ;
- CT = removeSuffix(CT,"u") ;
- return ;
+ if (suffix(RV, "e")) {
+ if (suffixPreceded(RV, "e", "gu")) {
+ CT = removeSuffix(CT, "e");
+ CT = removeSuffix(CT, "u");
+ return;
}
- if (suffixPreceded(RV,"e","ci")) {
- CT = removeSuffix(CT,"e") ;
- CT = removeSuffix(CT,"i") ;
- return ;
+ if (suffixPreceded(RV, "e", "ci")) {
+ CT = removeSuffix(CT, "e");
+ CT = removeSuffix(CT, "i");
+ return;
}
- CT = removeSuffix(CT,"e") ; return ;
+ CT = removeSuffix(CT, "e");
+ return;
}
}
/**
* For log and debug purpose
*
- * @return TERM, CT, RV, R1 and R2
+ * @return TERM, CT, RV, R1 and R2
*/
public String log() {
- return " (TERM = " + TERM + ")" +
- " (CT = " + CT +")" +
- " (RV = " + RV +")" +
- " (R1 = " + R1 +")" +
- " (R2 = " + R2 +")" ;
+ return " (TERM = "
+ + TERM
+ + ")"
+ + " (CT = "
+ + CT
+ + ")"
+ + " (RV = "
+ + RV
+ + ")"
+ + " (R1 = "
+ + R1
+ + ")"
+ + " (R2 = "
+ + R2
+ + ")";
}
-
}
-
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/package-info.java
index 080389b..a7ef132 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Analyzer for Brazilian Portuguese.
- */
+/** Analyzer for Brazilian Portuguese. */
package org.apache.lucene.analysis.br;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
index 8d90e95..a1f71fe 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
@@ -16,11 +16,9 @@
*/
package org.apache.lucene.analysis.ca;
-
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -41,35 +39,34 @@ import org.tartarus.snowball.ext.CatalanStemmer;
*/
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
-
+
/** File containing default Catalan stopwords. */
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
-
- private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
- new CharArraySet(
- Arrays.asList(
- "d", "l", "m", "n", "s", "t"
- ), true));
-
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ private static final CharArraySet DEFAULT_ARTICLES =
+ CharArraySet.unmodifiableSet(
+ new CharArraySet(Arrays.asList("d", "l", "m", "n", "s", "t"), true));
+
/**
* Returns an unmodifiable instance of the default stop words set.
+ *
* @return default stop words set.
*/
- public static CharArraySet getDefaultStopSet(){
+ public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
/**
- * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
- * accesses the static final set the first time.;
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+ * static final set the first time.;
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadStopwordSet(false,
- CatalanAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
+ DEFAULT_STOP_SET =
+ loadStopwordSet(false, CatalanAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
@@ -78,16 +75,14 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
}
}
- /**
- * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
- */
+ /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public CatalanAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
-
+
/**
* Builds an analyzer with the given stop words.
- *
+ *
* @param stopwords a stopword set
*/
public CatalanAnalyzer(CharArraySet stopwords) {
@@ -95,10 +90,9 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
- * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
- * stemming.
- *
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
+ * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming.
+ *
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
@@ -108,16 +102,13 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Creates a
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from an {@link StandardTokenizer} filtered with
- * {@link ElisionFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
+ * the text in the provided {@link Reader}.
+ *
+ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an
+ * {@link StandardTokenizer} filtered with {@link ElisionFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and
+ * {@link SnowballFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
@@ -125,8 +116,9 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
TokenStream result = new ElisionFilter(source, DEFAULT_ARTICLES);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopwords);
- if(!stemExclusionSet.isEmpty())
+ if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+ }
result = new SnowballFilter(result, new CatalanStemmer());
return new TokenStreamComponents(source, result);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/package-info.java
index 7f8b0da..312e683 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Analyzer for Catalan.
- */
+/** Analyzer for Catalan. */
package org.apache.lucene.analysis.ca;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
index 4fba9fe..b9ce02b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
@@ -16,24 +16,22 @@
*/
package org.apache.lucene.analysis.charfilter;
-import org.apache.lucene.analysis.CharFilter;
-import org.apache.lucene.util.ArrayUtil;
-
import java.io.Reader;
import java.util.Arrays;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.util.ArrayUtil;
/**
- * Base utility class for implementing a {@link CharFilter}.
- * You subclass this, and then record mappings by calling
- * {@link #addOffCorrectMap}, and then invoke the correct
- * method to correct an offset.
+ * Base utility class for implementing a {@link CharFilter}. You subclass this, and then record
+ * mappings by calling {@link #addOffCorrectMap}, and then invoke the correct method to correct an
+ * offset.
*/
public abstract class BaseCharFilter extends CharFilter {
private int offsets[];
private int diffs[];
private int size = 0;
-
+
public BaseCharFilter(Reader in) {
super(in);
}
@@ -53,24 +51,19 @@ public abstract class BaseCharFilter extends CharFilter {
final int diff = index < 0 ? 0 : diffs[index];
return currentOff + diff;
}
-
+
protected int getLastCumulativeDiff() {
- return offsets == null ?
- 0 : diffs[size-1];
+ return offsets == null ? 0 : diffs[size - 1];
}
/**
- * <p>
- * Adds an offset correction mapping at the given output stream offset.
- * </p>
- * <p>
- * Assumption: the offset given with each successive call to this method
- * will not be smaller than the offset given at the previous invocation.
- * </p>
+ * Adds an offset correction mapping at the given output stream offset.
+ *
+ * <p>Assumption: the offset given with each successive call to this method will not be smaller
+ * than the offset given at the previous invocation.
*
* @param off The output stream offset at which to apply the correction
- * @param cumulativeDiff The input offset is given by adding this
- * to the output offset
+ * @param cumulativeDiff The input offset is given by adding this to the output offset
*/
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
@@ -80,11 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
offsets = ArrayUtil.grow(offsets);
diffs = ArrayUtil.grow(diffs);
}
-
+
assert (size == 0 || off >= offsets[size - 1])
- : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
- + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
-
+ : "Offset #"
+ + size
+ + "("
+ + off
+ + ") is less than the last recorded offset "
+ + offsets[size - 1]
+ + "\n"
+ + Arrays.toString(offsets)
+ + "\n"
+ + Arrays.toString(diffs);
+
if (size == 0 || off != offsets[size - 1]) {
offsets[size] = off;
diffs[size++] = cumulativeDiff;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
index 1712227..84fc1cf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
@@ -16,16 +16,15 @@
*/
package org.apache.lucene.analysis.charfilter;
-
-import org.apache.lucene.analysis.CharFilterFactory;
-
import java.io.Reader;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
+import org.apache.lucene.analysis.CharFilterFactory;
/**
- * Factory for {@link HTMLStripCharFilter}.
+ * Factory for {@link HTMLStripCharFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -44,9 +43,9 @@ public class HTMLStripCharFilterFactory extends CharFilterFactory {
final Set<String> escapedTags;
static final Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
-
+
/** Creates a new HTMLStripCharFilterFactory */
- public HTMLStripCharFilterFactory(Map<String,String> args) {
+ public HTMLStripCharFilterFactory(Map<String, String> args) {
super(args);
escapedTags = getSet(args, "escapedTags");
if (!args.isEmpty()) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
index 5fffda9..04b7368 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.charfilter;
import java.io.IOException;
import java.io.Reader;
import java.util.Map;
-
import org.apache.lucene.analysis.CharFilter; // javadocs
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.CharsRef;
@@ -28,14 +27,10 @@ import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/**
- * Simplistic {@link CharFilter} that applies the mappings
- * contained in a {@link NormalizeCharMap} to the character
- * stream, and correcting the resulting changes to the
- * offsets. Matching is greedy (longest pattern matching at
- * a given point wins). Replacement is allowed to be the
- * empty string.
+ * Simplistic {@link CharFilter} that applies the mappings contained in a {@link NormalizeCharMap}
+ * to the character stream, and correcting the resulting changes to the offsets. Matching is greedy
+ * (longest pattern matching at a given point wins). Replacement is allowed to be the empty string.
*/
-
public class MappingCharFilter extends BaseCharFilter {
private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
@@ -43,7 +38,7 @@ public class MappingCharFilter extends BaseCharFilter {
private final FST.BytesReader fstReader;
private final RollingCharBuffer buffer = new RollingCharBuffer();
private final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
- private final Map<Character,FST.Arc<CharsRef>> cachedRootArcs;
+ private final Map<Character, FST.Arc<CharsRef>> cachedRootArcs;
private CharsRef replacement;
private int replacementPointer;
@@ -75,11 +70,12 @@ public class MappingCharFilter extends BaseCharFilter {
@Override
public int read() throws IOException {
- //System.out.println("\nread");
- while(true) {
+ // System.out.println("\nread");
+ while (true) {
if (replacement != null && replacementPointer < replacement.length) {
- //System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
+ // System.out.println(" return repl[" + replacementPointer + "]=" +
+ // replacement.chars[replacement.offset + replacementPointer]);
return replacement.chars[replacement.offset + replacementPointer++];
}
@@ -141,7 +137,7 @@ public class MappingCharFilter extends BaseCharFilter {
if (lastMatch != null) {
inputOff += lastMatchLen;
- //System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
+ // System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
final int diff = lastMatchLen - lastMatch.length;
@@ -155,7 +151,7 @@ public class MappingCharFilter extends BaseCharFilter {
// the "extra" chars all back to the same input
// offset:
final int outputStart = inputOff - prevCumulativeDiff;
- for(int extraIDX=0;extraIDX<-diff;extraIDX++) {
+ for (int extraIDX = 0; extraIDX < -diff; extraIDX++) {
addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
}
}
@@ -178,7 +174,7 @@ public class MappingCharFilter extends BaseCharFilter {
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
- for(int i = off; i < off + len; i++) {
+ for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
index 8296719..fa77a18 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.analysis.charfilter;
-
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
@@ -24,13 +23,13 @@ import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
/**
- * Factory for {@link MappingCharFilter}.
+ * Factory for {@link MappingCharFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -42,8 +41,7 @@ import org.apache.lucene.util.ResourceLoaderAware;
* @since Solr 1.4
* @lucene.spi {@value #NAME}
*/
-public class MappingCharFilterFactory extends CharFilterFactory implements
- ResourceLoaderAware {
+public class MappingCharFilterFactory extends CharFilterFactory implements ResourceLoaderAware {
/** SPI name */
public static final String NAME = "mapping";
@@ -52,7 +50,7 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
private final String mapping;
/** Creates a new MappingCharFilterFactory */
- public MappingCharFilterFactory(Map<String,String> args) {
+ public MappingCharFilterFactory(Map<String, String> args) {
super(args);
mapping = get(args, "mapping");
if (!args.isEmpty()) {
@@ -90,7 +88,7 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
public Reader create(Reader input) {
// if the map is null, it means there's actually no mappings... just return the original stream
// as there is nothing to do here.
- return normMap == null ? input : new MappingCharFilter(normMap,input);
+ return normMap == null ? input : new MappingCharFilter(normMap, input);
}
@Override
@@ -99,48 +97,62 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
}
// "source" => "target"
- static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
+ static Pattern p = Pattern.compile("\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$");
- protected void parseRules( List<String> rules, NormalizeCharMap.Builder builder ){
- for( String rule : rules ){
- Matcher m = p.matcher( rule );
- if( !m.find() )
- throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
- builder.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
+ protected void parseRules(List<String> rules, NormalizeCharMap.Builder builder) {
+ for (String rule : rules) {
+ Matcher m = p.matcher(rule);
+ if (!m.find())
+ throw new IllegalArgumentException(
+ "Invalid Mapping Rule : [" + rule + "], file = " + mapping);
+ builder.add(parseString(m.group(1)), parseString(m.group(2)));
}
}
char[] out = new char[256];
-
- protected String parseString( String s ){
+
+ protected String parseString(String s) {
int readPos = 0;
int len = s.length();
int writePos = 0;
- while( readPos < len ){
- char c = s.charAt( readPos++ );
- if( c == '\\' ){
- if( readPos >= len )
+ while (readPos < len) {
+ char c = s.charAt(readPos++);
+ if (c == '\\') {
+ if (readPos >= len)
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
- c = s.charAt( readPos++ );
- switch( c ) {
- case '\\' : c = '\\'; break;
- case '"' : c = '"'; break;
- case 'n' : c = '\n'; break;
- case 't' : c = '\t'; break;
- case 'r' : c = '\r'; break;
- case 'b' : c = '\b'; break;
- case 'f' : c = '\f'; break;
- case 'u' :
- if( readPos + 3 >= len )
+ c = s.charAt(readPos++);
+ switch (c) {
+ case '\\':
+ c = '\\';
+ break;
+ case '"':
+ c = '"';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 'b':
+ c = '\b';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'u':
+ if (readPos + 3 >= len)
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
- c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
+ c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
readPos += 4;
break;
}
}
out[writePos++] = c;
}
- return new String( out, 0, writePos );
+ return new String(out, 0, writePos);
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
index a217b4c..b3dba24 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
-
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
@@ -32,14 +31,13 @@ import org.apache.lucene.util.fst.Util;
// TODO: save/load?
/**
- * Holds a map of String input to String output, to be used
- * with {@link MappingCharFilter}. Use the {@link Builder}
- * to create this.
+ * Holds a map of String input to String output, to be used with {@link MappingCharFilter}. Use the
+ * {@link Builder} to create this.
*/
public class NormalizeCharMap {
final FST<CharsRef> map;
- final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>();
+ final Map<Character, FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>();
// Use the builder to create:
private NormalizeCharMap(FST<CharsRef> map) {
@@ -52,16 +50,18 @@ public class NormalizeCharMap {
map.getFirstArc(scratchArc);
if (FST.targetHasArcs(scratchArc)) {
map.readFirstRealTargetArc(scratchArc.target(), scratchArc, fstReader);
- while(true) {
+ while (true) {
assert scratchArc.label() != FST.END_LABEL;
- cachedRootArcs.put(Character.valueOf((char) scratchArc.label()), new FST.Arc<CharsRef>().copyFrom(scratchArc));
+ cachedRootArcs.put(
+ Character.valueOf((char) scratchArc.label()),
+ new FST.Arc<CharsRef>().copyFrom(scratchArc));
if (scratchArc.isLast()) {
break;
}
map.readNextRealArc(scratchArc, fstReader);
}
}
- //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
+ // System.out.println("cached " + cachedRootArcs.size() + " root arcs");
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)
throw new RuntimeException(ioe);
@@ -71,27 +71,27 @@ public class NormalizeCharMap {
/**
* Builds an NormalizeCharMap.
- * <p>
- * Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap
+ *
+ * <p>Call add() until you have added all the mappings, then call build() to get a
+ * NormalizeCharMap
+ *
* @lucene.experimental
*/
public static class Builder {
- private final Map<String,String> pendingPairs = new TreeMap<>();
+ private final Map<String, String> pendingPairs = new TreeMap<>();
- /** Records a replacement to be applied to the input
- * stream. Whenever <code>singleMatch</code> occurs in
- * the input, it will be replaced with
- * <code>replacement</code>.
+ /**
+ * Records a replacement to be applied to the input stream. Whenever <code>singleMatch</code>
+ * occurs in the input, it will be replaced with <code>replacement</code>.
*
* @param match input String to be replaced
* @param replacement output String
- * @throws IllegalArgumentException if
- * <code>match</code> is the empty string, or was
- * already previously added
+ * @throws IllegalArgumentException if <code>match</code> is the empty string, or was already
+ * previously added
*/
public void add(String match, String replacement) {
- if (match.length() == 0 ){
+ if (match.length() == 0) {
throw new IllegalArgumentException("cannot match the empty string");
}
if (pendingPairs.containsKey(match)) {
@@ -100,8 +100,7 @@ public class NormalizeCharMap {
pendingPairs.put(match, replacement);
}
- /** Builds the NormalizeCharMap; call this once you
- * are done calling {@link #add}. */
+ /** Builds the NormalizeCharMap; call this once you are done calling {@link #add}. */
public NormalizeCharMap build() {
final FST<CharsRef> map;
@@ -109,9 +108,8 @@ public class NormalizeCharMap {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
- for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
- fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
- new CharsRef(ent.getValue()));
+ for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
+ fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));
}
map = fstCompiler.compile();
pendingPairs.clear();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package-info.java
index 12a43a5..54a24d3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package-info.java
@@ -17,42 +17,31 @@
/**
* Normalization of text before the tokenizer.
- * <p>
- * CharFilters are chainable filters that normalize text before tokenization
- * and provide mappings between normalized text offsets and the corresponding
- * offset in the original text.
- * </p>
+ *
+ * <p>CharFilters are chainable filters that normalize text before tokenization and provide mappings
+ * between normalized text offsets and the corresponding offset in the original text.
+ *
* <H2>CharFilter offset mappings</H2>
- * <p>
- * CharFilters modify an input stream via a series of substring
- * replacements (including deletions and insertions) to produce an output
- * stream. There are three possible replacement cases: the replacement
- * string has the same length as the original substring; the replacement
- * is shorter; and the replacement is longer. In the latter two cases
- * (when the replacement has a different length than the original),
- * one or more offset correction mappings are required.
- * </p>
- * <p>
- * When the replacement is shorter than the original (e.g. when the
- * replacement is the empty string), a single offset correction mapping
- * should be added at the replacement's end offset in the output stream.
- * The <code>cumulativeDiff</code> parameter to the
- * <code>addOffCorrectMapping()</code> method will be the sum of all
- * previous replacement offset adjustments, with the addition of the
- * difference between the lengths of the original substring and the
- * replacement string (a positive value).
- * </p>
- * <p>
- * When the replacement is longer than the original (e.g. when the
- * original is the empty string), you should add as many offset
- * correction mappings as the difference between the lengths of the
- * replacement string and the original substring, starting at the
- * end offset the original substring would have had in the output stream.
- * The <code>cumulativeDiff</code> parameter to the
- * <code>addOffCorrectMapping()</code> method will be the sum of all
- * previous replacement offset adjustments, with the addition of the
- * difference between the lengths of the original substring and the
- * replacement string so far (a negative value).
- * </p>
+ *
+ * <p>CharFilters modify an input stream via a series of substring replacements (including deletions
+ * and insertions) to produce an output stream. There are three possible replacement cases: the
+ * replacement string has the same length as the original substring; the replacement is shorter; and
+ * the replacement is longer. In the latter two cases (when the replacement has a different length
+ * than the original), one or more offset correction mappings are required.
+ *
+ * <p>When the replacement is shorter than the original (e.g. when the replacement is the empty
+ * string), a single offset correction mapping should be added at the replacement's end offset in
+ * the output stream. The <code>cumulativeDiff</code> parameter to the <code>addOffCorrectMapping()
+ * </code> method will be the sum of all previous replacement offset adjustments, with the addition
+ * of the difference between the lengths of the original substring and the replacement string (a
+ * positive value).
+ *
+ * <p>When the replacement is longer than the original (e.g. when the original is the empty string),
+ * you should add as many offset correction mappings as the difference between the lengths of the
+ * replacement string and the original substring, starting at the end offset the original substring
+ * would have had in the output stream. The <code>cumulativeDiff</code> parameter to the <code>
+ * addOffCorrectMapping()</code> method will be the sum of all previous replacement offset
+ * adjustments, with the addition of the difference between the lengths of the original substring
+ * and the replacement string so far (a negative value).
*/
package org.apache.lucene.analysis.charfilter;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
index 403f43a..4eab251 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
@@ -16,9 +16,7 @@
*/
package org.apache.lucene.analysis.cjk;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -29,10 +27,9 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
- * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
- * normalizes content with {@link CJKWidthFilter}, folds case with
- * {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter},
- * and filters stopwords with {@link StopFilter}
+ * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer}, normalizes content with
+ * {@link CJKWidthFilter}, folds case with {@link LowerCaseFilter}, forms bigrams of CJK with {@link
+ * CJKBigramFilter}, and filters stopwords with {@link StopFilter}
*
* @since 3.1
*/
@@ -40,20 +37,21 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default CJK stopwords.
- * <p>
- * Currently it contains some common English words that are not usually
- * useful for searching and some double-byte interpunctions.
+ *
+ * <p>Currently it contains some common English words that are not usually useful for searching
+ * and some double-byte interpunctions.
*/
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
+ *
* @return an unmodifiable instance of the default stop-words set.
*/
- public static CharArraySet getDefaultStopSet(){
+ public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
@@ -68,20 +66,17 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
}
}
- /**
- * Builds an analyzer which removes words in {@link #getDefaultStopSet()}.
- */
+ /** Builds an analyzer which removes words in {@link #getDefaultStopSet()}. */
public CJKAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
-
+
/**
* Builds an analyzer with the given stop words
- *
- * @param stopwords
- * a stopword set
+ *
+ * @param stopwords a stopword set
*/
- public CJKAnalyzer(CharArraySet stopwords){
+ public CJKAnalyzer(CharArraySet stopwords) {
super(stopwords);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
index 7d79b84..abbd5d2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
@@ -16,9 +16,7 @@
*/
package org.apache.lucene.analysis.cjk;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -30,28 +28,24 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
/**
- * Forms bigrams of CJK terms that are generated from StandardTokenizer
- * or ICUTokenizer.
- * <p>
- * CJK types are set by these tokenizers, but you can also use
- * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
- * of the CJK scripts are turned into bigrams.
- * <p>
- * By default, when a CJK character has no adjacent characters to form
- * a bigram, it is output in unigram form. If you want to always output
- * both unigrams and bigrams, set the <code>outputUnigrams</code>
- * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
+ * Forms bigrams of CJK terms that are generated from StandardTokenizer or ICUTokenizer.
+ *
+ * <p>CJK types are set by these tokenizers, but you can also use {@link
+ * #CJKBigramFilter(TokenStream, int)} to explicitly control which of the CJK scripts are turned
+ * into bigrams.
+ *
+ * <p>By default, when a CJK character has no adjacent characters to form a bigram, it is output in
+ * unigram form. If you want to always output both unigrams and bigrams, set the <code>
+ * outputUnigrams</code> flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
- * <p>
- * Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries.
- * Korean Hangul characters are treated the same as many other scripts'
- * letters, and as a result, StandardTokenizer can produce tokens that mix
- * Hangul and non-Hangul characters, e.g. "한국abc". Such mixed-script tokens
- * are typed as <code><ALPHANUM></code> rather than
- * <code><HANGUL></code>, and as a result, will not be converted to
- * bigrams by CJKBigramFilter.
*
- * In all cases, all non-CJK input is passed thru unmodified.
+ * <p>Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries. Korean Hangul
+ * characters are treated the same as many other scripts' letters, and as a result,
+ * StandardTokenizer can produce tokens that mix Hangul and non-Hangul characters, e.g. "한국abc".
+ * Such mixed-script tokens are typed as <code><ALPHANUM></code> rather than <code>
+ * <HANGUL></code>, and as a result, will not be converted to bigrams by CJKBigramFilter.
+ *
+ * <p>In all cases, all non-CJK input is passed thru unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
// configuration
@@ -70,12 +64,15 @@ public final class CJKBigramFilter extends TokenFilter {
public static final String SINGLE_TYPE = "<SINGLE>";
// the types from standardtokenizer
- private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
- private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
- private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+ private static final String HAN_TYPE =
+ StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+ private static final String HIRAGANA_TYPE =
+ StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ private static final String KATAKANA_TYPE =
+ StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
-
- // sentinel value for ignoring a script
+
+ // sentinel value for ignoring a script
private static final Object NO = new Object();
// these are set to either their type or NO if we want to pass them thru
@@ -83,17 +80,18 @@ public final class CJKBigramFilter extends TokenFilter {
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
-
+
// true if we should output unigram tokens always
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram
-
+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionIncrementAttribute posIncAtt =
+ addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
-
+
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
int startOffset[] = new int[8];
@@ -102,65 +100,66 @@ public final class CJKBigramFilter extends TokenFilter {
int bufferLen;
// current buffer index
int index;
-
+
// the last end offset, to determine if we should bigram across tokens
int lastEndOffset;
-
+
private boolean exhausted;
-
- /**
- * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
- * CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
+
+ /**
+ * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int) CJKBigramFilter(in, HAN |
+ * HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
-
- /**
- * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
- * CJKBigramFilter(in, flags, false)}
+
+ /**
+ * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean) CJKBigramFilter(in,
+ * flags, false)}
*/
public CJKBigramFilter(TokenStream in, int flags) {
this(in, flags, false);
}
-
+
/**
- * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
- * and whether or not unigrams should also be output.
- * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
- * {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
+ * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, and whether
+ * or not unigrams should also be output.
+ *
+ * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
+ * {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
* @param outputUnigrams true if unigrams for the selected writing systems should also be output.
- * when this is false, this is only done when there are no adjacent characters to form
- * a bigram.
+ * when this is false, this is only done when there are no adjacent characters to form a
+ * bigram.
*/
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
super(in);
- doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
+ doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
- doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
+ doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
this.outputUnigrams = outputUnigrams;
}
-
+
/*
- * much of this complexity revolves around handling the special case of a
- * "lone cjk character" where cjktokenizer would output a unigram. this
+ * much of this complexity revolves around handling the special case of a
+ * "lone cjk character" where cjktokenizer would output a unigram. this
* is also the only time we ever have to captureState.
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (hasBufferedBigram()) {
-
+
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
-
+
if (outputUnigrams) {
// when also outputting unigrams, we output the unigram first,
// then rewind back to revisit the bigram.
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
- // the logic in hasBufferedUnigram ensures we output the C,
+ // the logic in hasBufferedUnigram ensures we output the C,
// even though it did actually have adjacent CJK characters.
if (ngramState) {
@@ -175,23 +174,23 @@ public final class CJKBigramFilter extends TokenFilter {
}
return true;
} else if (doNext()) {
-
+
// case 2: look at the token type. should we form any n-grams?
-
+
String type = typeAtt.type();
if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {
-
+
// acceptable CJK type: we form n-grams from these.
// as long as the offsets are aligned, we just add these to our current buffer.
// otherwise, we clear the buffer and start over.
-
+
if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
if (hasBufferedUnigram()) {
-
+
// we have a buffered unigram, and we peeked ahead to see if we could form
- // a bigram, but we can't, because the offsets are unaligned. capture the state
+ // a bigram, but we can't, because the offsets are unaligned. capture the state
// of this peeked data to be revisited next time thru the loop, and dump our unigram.
-
+
loneState = captureState();
flushUnigram();
return true;
@@ -201,15 +200,15 @@ public final class CJKBigramFilter extends TokenFilter {
}
refill();
} else {
-
+
// not a CJK type: we just return these as-is.
-
+
if (hasBufferedUnigram()) {
-
+
// we have a buffered unigram, and we peeked ahead to see if we could form
- // a bigram, but we can't, because it's not a CJK type. capture the state
+ // a bigram, but we can't, because it's not a CJK type. capture the state
// of this peeked data to be revisited next time thru the loop, and dump our unigram.
-
+
loneState = captureState();
flushUnigram();
return true;
@@ -217,12 +216,12 @@ public final class CJKBigramFilter extends TokenFilter {
return true;
}
} else {
-
- // case 3: we have only zero or 1 codepoints buffered,
+
+ // case 3: we have only zero or 1 codepoints buffered,
// so not enough to form a bigram. But, we also have no
// more input. So if we have a buffered codepoint, emit
// a unigram, otherwise, it's end of stream.
-
+
if (hasBufferedUnigram()) {
flushUnigram(); // flush our remaining unigram
return true;
@@ -231,12 +230,10 @@ public final class CJKBigramFilter extends TokenFilter {
}
}
}
-
+
private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
-
- /**
- * looks at next input token, returning false is none is available
- */
+
+ /** looks at next input token, returning false is none is available */
private boolean doNext() throws IOException {
if (loneState != null) {
restoreState(loneState);
@@ -253,10 +250,8 @@ public final class CJKBigramFilter extends TokenFilter {
}
}
}
-
- /**
- * refills buffers with new data from the current token.
- */
+
+ /** refills buffers with new data from the current token. */
private void refill() {
// compact buffers to keep them smallish if they become large
// just a safety check, but technically we only need the last codepoint
@@ -273,7 +268,7 @@ public final class CJKBigramFilter extends TokenFilter {
int len = termAtt.length();
int start = offsetAtt.startOffset();
int end = offsetAtt.endOffset();
-
+
int newSize = bufferLen + len;
buffer = ArrayUtil.grow(buffer, newSize);
startOffset = ArrayUtil.grow(startOffset, newSize);
@@ -300,17 +295,17 @@ public final class CJKBigramFilter extends TokenFilter {
}
}
- /**
- * Flushes a bigram token to output from our buffer
- * This is the normal case, e.g. ABC -> AB BC
+ /**
+ * Flushes a bigram token to output from our buffer This is the normal case, e.g. ABC -> AB BC
*/
private void flushBigram() {
clearAttributes();
- char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
+ char termBuffer[] =
+ termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
int len1 = Character.toChars(buffer[index], termBuffer, 0);
- int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
+ int len2 = len1 + Character.toChars(buffer[index + 1], termBuffer, len1);
termAtt.setLength(len2);
- offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
+ offsetAtt.setOffset(startOffset[index], endOffset[index + 1]);
typeAtt.setType(DOUBLE_TYPE);
// when outputting unigrams, all bigrams are synonyms that span two unigrams
if (outputUnigrams) {
@@ -319,12 +314,11 @@ public final class CJKBigramFilter extends TokenFilter {
}
index++;
}
-
- /**
- * Flushes a unigram token to output from our buffer.
- * This happens when we encounter isolated CJK characters, either the whole
- * CJK string is a single character, or we encounter a CJK character surrounded
- * by space, punctuation, english, etc, but not beside any other CJK.
+
+ /**
+ * Flushes a unigram token to output from our buffer. This happens when we encounter isolated CJK
+ * characters, either the whole CJK string is a single character, or we encounter a CJK character
+ * surrounded by space, punctuation, english, etc, but not beside any other CJK.
*/
private void flushUnigram() {
clearAttributes();
@@ -335,18 +329,15 @@ public final class CJKBigramFilter extends TokenFilter {
typeAtt.setType(SINGLE_TYPE);
index++;
}
-
- /**
- * True if we have multiple codepoints sitting in our buffer
- */
+
+ /** True if we have multiple codepoints sitting in our buffer */
private boolean hasBufferedBigram() {
return bufferLen - index > 1;
}
/**
- * True if we have a single codepoint sitting in our buffer, where its future
- * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
- * inputs.
+ * True if we have a single codepoint sitting in our buffer, where its future (whether it is
+ * emitted as unigram or forms a bigram) depends upon not-yet-seen inputs.
*/
private boolean hasBufferedUnigram() {
if (outputUnigrams) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
index 1eceaf9..1c8699a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
@@ -16,25 +16,25 @@
*/
package org.apache.lucene.analysis.cjk;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
-/**
+/**
* Factory for {@link CJKBigramFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_cjk" class="solr.TextField">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.CJKWidthFilterFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.CJKBigramFilterFactory"
- * han="true" hiragana="true"
+ * <filter class="solr.CJKBigramFilterFactory"
+ * han="true" hiragana="true"
* katakana="true" hangul="true" outputUnigrams="false" />
* </analyzer>
* </fieldType></pre>
+ *
* @since 3.6.0
* @lucene.spi {@value #NAME}
*/
@@ -47,7 +47,7 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
final boolean outputUnigrams;
/** Creates a new CJKBigramFilterFactory */
- public CJKBigramFilterFactory(Map<String,String> args) {
+ public CJKBigramFilterFactory(Map<String, String> args) {
super(args);
int flags = 0;
if (getBoolean(args, "han", true)) {
@@ -68,7 +68,7 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public CJKBigramFilterFactory() {
throw defaultCtorException();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java
index 4fb7ced..16cd6f2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java
@@ -17,19 +17,19 @@
package org.apache.lucene.analysis.cjk;
-import org.apache.lucene.analysis.charfilter.BaseCharFilter;
-
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
/**
* A {@link org.apache.lucene.analysis.CharFilter} that normalizes CJK width differences:
+ *
* <ul>
* <li>Folds fullwidth ASCII variants into the equivalent basic latin
* <li>Folds halfwidth Katakana variants into the equivalent kana
* </ul>
- * <p>
- * NOTE: this char filter is the exact counterpart of {@link CJKWidthFilter}.
+ *
+ * <p>NOTE: this char filter is the exact counterpart of {@link CJKWidthFilter}.
*/
public class CJKWidthCharFilter extends BaseCharFilter {
@@ -39,30 +39,33 @@ public class CJKWidthCharFilter extends BaseCharFilter {
* as a fallback when they cannot properly combine with a preceding
* character into a composed form.
*/
- private static final char KANA_NORM[] = new char[] {
- 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
- 0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
- 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
- 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
- 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
- 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
- 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
- };
+ private static final char KANA_NORM[] =
+ new char[] {
+ 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
+ 0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
+ 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+ 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
+ 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
+ 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
+ 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
+ };
/* kana combining diffs: 0x30A6-0x30FD */
- private static final byte KANA_COMBINE_VOICED[] = new byte[] {
- 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
- 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
- };
-
- private static final byte KANA_COMBINE_SEMI_VOICED[] = new byte[] {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
- 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
+ private static final byte KANA_COMBINE_VOICED[] =
+ new byte[] {
+ 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
+ 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+ };
+
+ private static final byte KANA_COMBINE_SEMI_VOICED[] =
+ new byte[] {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
+ 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
private static final int HW_KATAKANA_VOICED_MARK = 0xFF9E;
private static final int HW_KATAKANA_SEMI_VOICED_MARK = 0xFF9F;
@@ -77,7 +80,7 @@ public class CJKWidthCharFilter extends BaseCharFilter {
@Override
public int read() throws IOException {
- while(true) {
+ while (true) {
final int ch = input.read();
if (ch == -1) {
// reached end of the input
@@ -126,9 +129,10 @@ public class CJKWidthCharFilter extends BaseCharFilter {
private int combineVoiceMark(int ch, int voiceMark) {
assert voiceMark == HW_KATAKANA_SEMI_VOICED_MARK || voiceMark == HW_KATAKANA_VOICED_MARK;
if (ch >= 0x30A6 && ch <= 0x30FD) {
- ch += (voiceMark == HW_KATAKANA_SEMI_VOICED_MARK)
- ? KANA_COMBINE_SEMI_VOICED[prevChar - 0x30A6]
- : KANA_COMBINE_VOICED[prevChar - 0x30A6];
+ ch +=
+ (voiceMark == HW_KATAKANA_SEMI_VOICED_MARK)
+ ? KANA_COMBINE_SEMI_VOICED[prevChar - 0x30A6]
+ : KANA_COMBINE_VOICED[prevChar - 0x30A6];
}
return ch;
}
@@ -136,7 +140,7 @@ public class CJKWidthCharFilter extends BaseCharFilter {
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
- for(int i = off; i < off + len; i++) {
+ for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
@@ -144,5 +148,4 @@ public class CJKWidthCharFilter extends BaseCharFilter {
}
return numRead == 0 ? -1 : numRead;
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
index 4f8bf09..bec98c9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.cjk;
-
-import org.apache.lucene.analysis.CharFilterFactory;
-
import java.io.Reader;
import java.util.Map;
+import org.apache.lucene.analysis.CharFilterFactory;
/**
* Factory for {@link CJKWidthCharFilter}.
+ *
* @lucene.spi {@value #NAME}
*/
public class CJKWidthCharFilterFactory extends CharFilterFactory {
@@ -32,7 +31,7 @@ public class CJKWidthCharFilterFactory extends CharFilterFactory {
public static final String NAME = "cjkWidth";
/** Creates a new CJKWidthCharFilterFactory */
- public CJKWidthCharFilterFactory(Map<String,String> args) {
+ public CJKWidthCharFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -53,5 +52,4 @@ public class CJKWidthCharFilterFactory extends CharFilterFactory {
public Reader normalize(Reader input) {
return create(input);
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java
index 485ac63..9997576 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilter.java
@@ -16,9 +16,7 @@
*/
package org.apache.lucene.analysis.cjk;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -26,33 +24,34 @@ import org.apache.lucene.analysis.util.StemmerUtil;
/**
* A {@link TokenFilter} that normalizes CJK width differences:
+ *
* <ul>
* <li>Folds fullwidth ASCII variants into the equivalent basic latin
* <li>Folds halfwidth Katakana variants into the equivalent kana
* </ul>
- * <p>
- * NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
- * Unicode normalization. See the normalization support in the ICU package
- * for full normalization.
+ *
+ * <p>NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD Unicode normalization.
+ * See the normalization support in the ICU package for full normalization.
*/
public final class CJKWidthFilter extends TokenFilter {
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
- /* halfwidth kana mappings: 0xFF65-0xFF9D
+
+ /* halfwidth kana mappings: 0xFF65-0xFF9D
*
* note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A
- * as a fallback when they cannot properly combine with a preceding
+ * as a fallback when they cannot properly combine with a preceding
* character into a composed form.
*/
- private static final char KANA_NORM[] = new char[] {
- 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
- 0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
- 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
- 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
- 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
- 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
- 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
- };
+ private static final char KANA_NORM[] =
+ new char[] {
+ 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
+ 0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
+ 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+ 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
+ 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
+ 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
+ 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
+ };
public CJKWidthFilter(TokenStream input) {
super(input);
@@ -85,28 +84,30 @@ public final class CJKWidthFilter extends TokenFilter {
}
/* kana combining diffs: 0x30A6-0x30FD */
- private static final byte KANA_COMBINE_VOICED[] = new byte[] {
- 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
- 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
- };
-
- private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
- 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
-
+ private static final byte KANA_COMBINE_VOICED[] =
+ new byte[] {
+ 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
+ 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+ };
+
+ private static final byte KANA_COMBINE_HALF_VOICED[] =
+ new byte[] {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
+ 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+
/** returns true if we successfully combined the voice mark */
private static boolean combine(char text[], int pos, char ch) {
- final char prev = text[pos-1];
+ final char prev = text[pos - 1];
if (prev >= 0x30A6 && prev <= 0x30FD) {
- text[pos-1] += (ch == 0xFF9F)
- ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
- : KANA_COMBINE_VOICED[prev - 0x30A6];
- return text[pos-1] != prev;
+ text[pos - 1] +=
+ (ch == 0xFF9F)
+ ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
+ : KANA_COMBINE_VOICED[prev - 0x30A6];
+ return text[pos - 1] != prev;
}
return false;
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
index 3e522e3..8e464fd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.cjk;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
-/**
+/**
* Factory for {@link CJKWidthFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_cjk" class="solr.TextField">
* <analyzer>
@@ -33,6 +32,7 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* <filter class="solr.CJKBigramFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 3.6.0
* @lucene.spi {@value #NAME}
*/
@@ -40,15 +40,15 @@ public class CJKWidthFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "cjkWidth";
-
+
/** Creates a new CJKWidthFilterFactory */
- public CJKWidthFilterFactory(Map<String,String> args) {
+ public CJKWidthFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public CJKWidthFilterFactory() {
throw defaultCtorException();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/package-info.java
index 2fd4ad4..77e859c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/package-info.java
@@ -16,21 +16,28 @@
*/
/**
- * Analyzer for Chinese, Japanese, and Korean, which indexes bigrams.
- * This analyzer generates bigram terms, which are overlapping groups of two adjacent Han, Hiragana, Katakana, or Hangul characters.
- * <p>
- * Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
+ * Analyzer for Chinese, Japanese, and Korean, which indexes bigrams. This analyzer generates bigram
+ * terms, which are overlapping groups of two adjacent Han, Hiragana, Katakana, or Hangul
+ * characters.
+ *
+ * <p>Three analyzers are provided for Chinese, each of which treats Chinese text in a different
+ * way.
+ *
* <ul>
- * <li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
- * <li>CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
- * <li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
+ * <li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese
+ * characters) as a token.
+ * <li>CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese
+ * characters) as tokens.
+ * <li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment
+ * Chinese text into words) as tokens.
* </ul>
- *
+ *
* Example phrase: "我是中国人"
+ *
* <ol>
- * <li>ChineseAnalyzer: 我-是-中-国-人</li>
- * <li>CJKAnalyzer: 我是-是中-中国-国人</li>
- * <li>SmartChineseAnalyzer: 我-是-中国-人</li>
+ * <li>ChineseAnalyzer: 我-是-中-国-人
+ * <li>CJKAnalyzer: 我是-是中-中国-国人
+ * <li>SmartChineseAnalyzer: 我-是-中国-人
* </ol>
*/
package org.apache.lucene.analysis.cjk;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
index 4b70886..e76d8c7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
@@ -16,11 +16,9 @@
*/
package org.apache.lucene.analysis.ckb;
-
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -41,29 +39,32 @@ import org.apache.lucene.util.IOUtils;
*/
public final class SoraniAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
-
+
/** File containing default Kurdish stopwords. */
- public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
-
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
/**
* Returns an unmodifiable instance of the default stop words set.
+ *
* @return default stop words set.
*/
public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
-
+
/**
- * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
- * accesses the static final set the first time.;
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+ * static final set the first time.;
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
- DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+ DEFAULT_STOP_SET =
+ WordlistLoader.getWordSet(
+ IOUtils.getDecodingReader(
+ SoraniAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
@@ -72,16 +73,14 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
}
}
- /**
- * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
- */
+ /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public SoraniAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
-
+
/**
* Builds an analyzer with the given stop words.
- *
+ *
* @param stopwords a stopword set
*/
public SoraniAnalyzer(CharArraySet stopwords) {
@@ -89,10 +88,9 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
- * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
- * stemming.
- *
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
+ * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming.
+ *
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
@@ -102,17 +100,13 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Creates a
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A
- * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from an {@link StandardTokenizer} filtered with
- * {@link SoraniNormalizationFilter},
- * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}
- * , {@link SetKeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SoraniStemFilter}.
+ * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
+ * the text in the provided {@link Reader}.
+ *
+ * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an
+ * {@link StandardTokenizer} filtered with {@link SoraniNormalizationFilter}, {@link
+ * LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}, {@link
+ * SetKeywordMarkerFilter} if a stem exclusion set is provided and {@link SoraniStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
@@ -121,8 +115,7 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(result);
result = new DecimalDigitFilter(result);
result = new StopFilter(result, stopwords);
- if(!stemExclusionSet.isEmpty())
- result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+ if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SoraniStemFilter(result);
return new TokenStreamComponents(source, result);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilter.java
index 6819b5b..a286d24 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilter.java
@@ -16,17 +16,12 @@
*/
package org.apache.lucene.analysis.ckb;
-
import java.io.IOException;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-/**
- * A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the
- * orthography.
- */
+/** A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the orthography. */
public final class SoraniNormalizationFilter extends TokenFilter {
private final SoraniNormalizer normalizer = new SoraniNormalizer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -41,7 +36,7 @@ public final class SoraniNormalizationFilter extends TokenFilter {
final int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
return true;
- }
+ }
return false;
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilterFactory.java
index 26a9a68..59409f7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizationFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.ckb;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link SoraniNormalizationFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -31,6 +30,7 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* <filter class="solr.SoraniNormalizationFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 4.7.0
* @lucene.spi {@value #NAME}
*/
@@ -40,7 +40,7 @@ public class SoraniNormalizationFilterFactory extends TokenFilterFactory {
public static final String NAME = "soraniNormalization";
/** Creates a new SoraniNormalizationFilterFactory */
- public SoraniNormalizationFilterFactory(Map<String,String> args) {
+ public SoraniNormalizationFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizer.java
index bb9187c..228909d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniNormalizer.java
@@ -18,38 +18,40 @@ package org.apache.lucene.analysis.ckb;
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
-/**
+/**
* Normalizes the Unicode representation of Sorani text.
- * <p>
- * Normalization consists of:
+ *
+ * <p>Normalization consists of:
+ *
* <ul>
* <li>Alternate forms of 'y' (0064, 0649) are converted to 06CC (FARSI YEH)
* <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
* <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
* <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
- * <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
+ * <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V
+ * BELOW)
* <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
* </ul>
*/
public class SoraniNormalizer {
-
+
static final char YEH = '\u064A';
static final char DOTLESS_YEH = '\u0649';
static final char FARSI_YEH = '\u06CC';
-
+
static final char KAF = '\u0643';
static final char KEHEH = '\u06A9';
-
+
static final char HEH = '\u0647';
static final char AE = '\u06D5';
static final char ZWNJ = '\u200C';
static final char HEH_DOACHASHMEE = '\u06BE';
static final char TEH_MARBUTA = '\u0629';
-
+
static final char REH = '\u0631';
static final char RREH = '\u0695';
static final char RREH_ABOVE = '\u0692';
-
+
static final char TATWEEL = '\u0640';
static final char FATHATAN = '\u064B';
static final char DAMMATAN = '\u064C';
@@ -62,7 +64,7 @@ public class SoraniNormalizer {
/**
* Normalize an input buffer of Sorani text
- *
+ *
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
@@ -78,14 +80,14 @@ public class SoraniNormalizer {
s[i] = KEHEH;
break;
case ZWNJ:
- if (i > 0 && s[i-1] == HEH) {
- s[i-1] = AE;
+ if (i > 0 && s[i - 1] == HEH) {
+ s[i - 1] = AE;
}
len = delete(s, i, len);
i--;
break;
case HEH:
- if (i == len-1) {
+ if (i == len - 1) {
s[i] = AE;
}
break;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilter.java
index 0d7a207..962cf2d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilter.java
@@ -16,29 +16,27 @@
*/
package org.apache.lucene.analysis.ckb;
-
import java.io.IOException;
-
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link SoraniStemmer} to stem Sorani words.
- * <p>
- * To prevent terms from being stemmed use an instance of
- * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
- * the {@link KeywordAttribute} before this {@link TokenStream}.
- * </p>
- * @see SetKeywordMarkerFilter */
-
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
+ * @see SetKeywordMarkerFilter
+ */
public final class SoraniStemFilter extends TokenFilter {
private final SoraniStemmer stemmer = new SoraniStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
-
+
public SoraniStemFilter(TokenStream input) {
super(input);
}
@@ -46,7 +44,7 @@ public final class SoraniStemFilter extends TokenFilter {
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- if(!keywordAttr.isKeyword()) {
+ if (!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilterFactory.java
index ca12bc9..00ce270 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemFilterFactory.java
@@ -16,14 +16,13 @@
*/
package org.apache.lucene.analysis.ckb;
-
import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link SoraniStemFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -32,6 +31,7 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* <filter class="solr.SoraniStemFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 4.7.0
* @lucene.spi {@value #NAME}
*/
@@ -41,7 +41,7 @@ public class SoraniStemFilterFactory extends TokenFilterFactory {
public static final String NAME = "soraniStem";
/** Creates a new SoraniStemFilterFactory */
- public SoraniStemFilterFactory(Map<String,String> args) {
+ public SoraniStemFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemmer.java
index 70c5e7f..b7a8df5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniStemmer.java
@@ -16,17 +16,14 @@
*/
package org.apache.lucene.analysis.ckb;
-
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
-/**
- * Light stemmer for Sorani
- */
+/** Light stemmer for Sorani */
public class SoraniStemmer {
-
+
/**
* Stem an input buffer of Sorani text.
- *
+ *
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
@@ -40,63 +37,64 @@ public class SoraniStemmer {
} else if (len > 6 && endsWith(s, len, "ەوە")) {
len -= 3;
}
-
+
// possessive pronoun
- if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
+ if (len > 6
+ && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
len -= 3;
}
-
+
// indefinite singular ezafe
if (len > 6 && endsWith(s, len, "ێکی")) {
- return len-3;
+ return len - 3;
} else if (len > 7 && endsWith(s, len, "یەکی")) {
- return len-4;
+ return len - 4;
}
// indefinite singular
if (len > 5 && endsWith(s, len, "ێک")) {
- return len-2;
+ return len - 2;
} else if (len > 6 && endsWith(s, len, "یەک")) {
- return len-3;
+ return len - 3;
}
// definite singular
else if (len > 6 && endsWith(s, len, "ەکە")) {
- return len-3;
+ return len - 3;
} else if (len > 5 && endsWith(s, len, "کە")) {
- return len-2;
+ return len - 2;
}
// definite plural
else if (len > 7 && endsWith(s, len, "ەکان")) {
- return len-4;
+ return len - 4;
} else if (len > 6 && endsWith(s, len, "کان")) {
- return len-3;
+ return len - 3;
}
// indefinite plural ezafe
else if (len > 7 && endsWith(s, len, "یانی")) {
- return len-4;
+ return len - 4;
} else if (len > 6 && endsWith(s, len, "انی")) {
- return len-3;
+ return len - 3;
}
// indefinite plural
else if (len > 6 && endsWith(s, len, "یان")) {
- return len-3;
+ return len - 3;
} else if (len > 5 && endsWith(s, len, "ان")) {
- return len-2;
- }
+ return len - 2;
+ }
// demonstrative plural
else if (len > 7 && endsWith(s, len, "یانە")) {
- return len-4;
+ return len - 4;
} else if (len > 6 && endsWith(s, len, "انە")) {
- return len-3;
+ return len - 3;
}
// demonstrative singular
else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە"))) {
- return len-2;
+ return len - 2;
} else if (len > 4 && endsWith(s, len, "ە")) {
- return len-1;
+ return len - 1;
}
// absolute singular ezafe
else if (len > 4 && endsWith(s, len, "ی")) {
- return len-1;
+ return len - 1;
}
return len;
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/package-info.java
index 5fccddf..97878e3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Analyzer for Sorani Kurdish.
- */
+/** Analyzer for Sorani Kurdish. */
package org.apache.lucene.analysis.ckb;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicAnalyzer.java
index f646e1e..e76803cc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicAnalyzer.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.classic;
-
import java.io.IOException;
import java.io.Reader;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
@@ -30,13 +28,11 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
- * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
- * LowerCaseFilter} and {@link StopFilter}, using a list of
- * English stop words.
- *
- * ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
- * As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
- * as specified by UAX#29.
+ * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link LowerCaseFilter} and {@link
+ * StopFilter}, using a list of English stop words.
+ *
+ * <p>ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1. As of 3.1, {@link
+ * StandardAnalyzer} implements Unicode text segmentation, as specified by UAX#29.
*
* @since 3.1
*/
@@ -47,43 +43,45 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
- /** An unmodifiable set containing some common English words that are usually not
- useful for searching. */
+ /**
+ * An unmodifiable set containing some common English words that are usually not useful for
+ * searching.
+ */
public static final CharArraySet STOP_WORDS_SET = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
- /** Builds an analyzer with the given stop words.
- * @param stopWords stop words */
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopWords stop words
+ */
public ClassicAnalyzer(CharArraySet stopWords) {
super(stopWords);
}
- /** Builds an analyzer with the default stop words ({@link
- * #STOP_WORDS_SET}).
- */
+ /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}). */
public ClassicAnalyzer() {
this(STOP_WORDS_SET);
}
- /** Builds an analyzer with the stop words from the given reader.
+ /**
+ * Builds an analyzer with the stop words from the given reader.
+ *
* @see WordlistLoader#getWordSet(Reader)
- * @param stopwords Reader to read stop words from */
+ * @param stopwords Reader to read stop words from
+ */
public ClassicAnalyzer(Reader stopwords) throws IOException {
this(loadStopwordSet(stopwords));
}
/**
- * Set maximum allowed token length. If a token is seen
- * that exceeds this length then it is discarded. This
- * setting only takes effect the next time tokenStream or
- * tokenStream is called.
+ * Set maximum allowed token length. If a token is seen that exceeds this length then it is
+ * discarded. This setting only takes effect the next time tokenStream or tokenStream is called.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
}
-
- /**
- * @see #setMaxTokenLength
- */
+
+ /** @see #setMaxTokenLength */
public int getMaxTokenLength() {
return maxTokenLength;
}
@@ -95,10 +93,12 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
TokenStream tok = new ClassicFilter(src);
tok = new LowerCaseFilter(tok);
tok = new StopFilter(tok, stopwords);
- return new TokenStreamComponents(r -> {
- src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
- src.setReader(r);
- }, tok);
+ return new TokenStreamComponents(
+ r -> {
+ src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
+ src.setReader(r);
+ },
+ tok);
}
@Override
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilter.java
index 9de1235..213e052 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilter.java
@@ -16,14 +16,12 @@
*/
package org.apache.lucene.analysis.classic;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** Normalizes tokens extracted with {@link ClassicTokenizer}. */
-
public class ClassicFilter extends TokenFilter {
/** Construct filtering <i>in</i>. */
@@ -31,15 +29,19 @@ public class ClassicFilter extends TokenFilter {
super(in);
}
- private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+ private static final String APOSTROPHE_TYPE =
+ ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
// this filters uses attribute type
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
- /** Returns the next token in the stream, or null at EOS.
+
+ /**
+ * Returns the next token in the stream, or null at EOS.
+ *
* <p>Removes <code>'s</code> from the end of words.
+ *
* <p>Removes dots from acronyms.
*/
@Override
@@ -52,18 +54,18 @@ public class ClassicFilter extends TokenFilter {
final int bufferLength = termAtt.length();
final String type = typeAtt.type();
- if (type == APOSTROPHE_TYPE && // remove 's
- bufferLength >= 2 &&
- buffer[bufferLength-2] == '\'' &&
- (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+ // remove 's
+ if (type == APOSTROPHE_TYPE
+ && bufferLength >= 2
+ && buffer[bufferLength - 2] == '\''
+ && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
// Strip last 2 characters off
termAtt.setLength(bufferLength - 2);
- } else if (type == ACRONYM_TYPE) { // remove dots
+ } else if (type == ACRONYM_TYPE) { // remove dots
int upto = 0;
- for(int i=0;i<bufferLength;i++) {
+ for (int i = 0; i < bufferLength; i++) {
char c = buffer[i];
- if (c != '.')
- buffer[upto++] = c;
+ if (c != '.') buffer[upto++] = c;
}
termAtt.setLength(upto);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilterFactory.java
index 75d4865..4c153bf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicFilterFactory.java
@@ -16,15 +16,14 @@
*/
package org.apache.lucene.analysis.classic;
-
import java.util.Map;
-
import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link ClassicFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -32,6 +31,7 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* <filter class="solr.ClassicFilterFactory"/>
* </analyzer>
* </fieldType></pre>
+ *
* @since 3.1.0
* @lucene.spi {@value #NAME}
*/
@@ -39,15 +39,15 @@ public class ClassicFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "classic";
-
+
/** Creates a new ClassicFilterFactory */
- public ClassicFilterFactory(Map<String,String> args) {
+ public ClassicFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public ClassicFilterFactory() {
throw defaultCtorException();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizer.java
index 261d843..50f5e43 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizer.java
@@ -18,71 +18,69 @@
package org.apache.lucene.analysis.classic;
import java.io.IOException;
-
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
-/** A grammar-based tokenizer constructed with JFlex
+/**
+ * A grammar-based tokenizer constructed with JFlex
*
- * <p> This should be a good tokenizer for most European-language documents:
+ * <p>This should be a good tokenizer for most European-language documents:
*
* <ul>
- * <li>Splits words at punctuation characters, removing punctuation. However, a
- * dot that's not followed by whitespace is considered part of a token.
- * <li>Splits words at hyphens, unless there's a number in the token, in which case
- * the whole token is interpreted as a product number and is not split.
+ * <li>Splits words at punctuation characters, removing punctuation. However, a dot that's not
+ * followed by whitespace is considered part of a token.
+ * <li>Splits words at hyphens, unless there's a number in the token, in which case the whole
+ * token is interpreted as a product number and is not split.
* <li>Recognizes email addresses and internet hostnames as one token.
* </ul>
*
- * <p>Many applications have specific tokenizer needs. If this tokenizer does
- * not suit your application, please consider copying this source code
- * directory to your project and maintaining your own grammar-based tokenizer.
+ * <p>Many applications have specific tokenizer needs. If this tokenizer does not suit your
+ * application, please consider copying this source code directory to your project and maintaining
+ * your own grammar-based tokenizer.
*
- * ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
- * As of 3.1, {@link StandardTokenizer} implements Unicode text segmentation,
- * as specified by UAX#29.
+ * <p>ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1. As of 3.1,
+ * {@link StandardTokenizer} implements Unicode text segmentation, as specified by UAX#29.
*/
-
public final class ClassicTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private ClassicTokenizerImpl scanner;
- public static final int ALPHANUM = 0;
- public static final int APOSTROPHE = 1;
- public static final int ACRONYM = 2;
- public static final int COMPANY = 3;
- public static final int EMAIL = 4;
- public static final int HOST = 5;
- public static final int NUM = 6;
- public static final int CJ = 7;
+ public static final int ALPHANUM = 0;
+ public static final int APOSTROPHE = 1;
+ public static final int ACRONYM = 2;
+ public static final int COMPANY = 3;
+ public static final int EMAIL = 4;
+ public static final int HOST = 5;
+ public static final int NUM = 6;
+ public static final int CJ = 7;
- public static final int ACRONYM_DEP = 8;
+ public static final int ACRONYM_DEP = 8;
/** String token types that correspond to token type int constants */
- public static final String [] TOKEN_TYPES = new String [] {
- "<ALPHANUM>",
- "<APOSTROPHE>",
- "<ACRONYM>",
- "<COMPANY>",
- "<EMAIL>",
- "<HOST>",
- "<NUM>",
- "<CJ>",
- "<ACRONYM_DEP>"
- };
-
+ public static final String[] TOKEN_TYPES =
+ new String[] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ "<ACRONYM_DEP>"
+ };
+
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
- /** Set the max allowed token length. Any token longer
- * than this is skipped. */
+ /** Set the max allowed token length. Any token longer than this is skipped. */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
@@ -96,18 +94,16 @@ public final class ClassicTokenizer extends Tokenizer {
}
/**
- * Creates a new instance of the {@link ClassicTokenizer}. Attaches
- * the <code>input</code> to the newly created JFlex scanner.
+ * Creates a new instance of the {@link ClassicTokenizer}. Attaches the <code>input</code> to the
+ * newly created JFlex scanner.
*
- * See http://issues.apache.org/jira/browse/LUCENE-1068
+ * <p>See http://issues.apache.org/jira/browse/LUCENE-1068
*/
public ClassicTokenizer() {
init();
}
- /**
- * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
- */
+ /** Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory} */
public ClassicTokenizer(AttributeFactory factory) {
super(factory);
init();
@@ -121,7 +117,8 @@ public final class ClassicTokenizer extends Tokenizer {
// term offset, positionIncrement and type
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt =
+ addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
/*
@@ -134,7 +131,7 @@ public final class ClassicTokenizer extends Tokenizer {
clearAttributes();
skippedPositions = 0;
- while(true) {
+ while (true) {
int tokenType = scanner.getNextToken();
if (tokenType == ClassicTokenizerImpl.YYEOF) {
@@ -142,10 +139,10 @@ public final class ClassicTokenizer extends Tokenizer {
}
if (scanner.yylength() <= maxTokenLength) {
- posIncrAtt.setPositionIncrement(skippedPositions+1);
+ posIncrAtt.setPositionIncrement(skippedPositions + 1);
scanner.getText(termAtt);
final int start = scanner.yychar();
- offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
if (tokenType == ClassicTokenizer.ACRONYM_DEP) {
typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]);
@@ -160,7 +157,7 @@ public final class ClassicTokenizer extends Tokenizer {
skippedPositions++;
}
}
-
+
@Override
public final void end() throws IOException {
super.end();
@@ -168,9 +165,9 @@ public final class ClassicTokenizer extends Tokenizer {
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
// adjust any skipped tokens
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
-
+
@Override
public void close() throws IOException {
super.close();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerFactory.java
index 42ded7c..d62dbf9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerFactory.java
@@ -16,15 +16,14 @@
*/
package org.apache.lucene.analysis.classic;
-
+import java.util.Map;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.AttributeFactory;
-import java.util.Map;
-
/**
* Factory for {@link ClassicTokenizer}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -43,7 +42,7 @@ public class ClassicTokenizerFactory extends TokenizerFactory {
private final int maxTokenLength;
/** Creates a new ClassicTokenizerFactory */
- public ClassicTokenizerFactory(Map<String,String> args) {
+ public ClassicTokenizerFactory(Map<String, String> args) {
super(args);
maxTokenLength = getInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
if (!args.isEmpty()) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java
index 163027b..9f526a9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java
@@ -21,11 +21,8 @@ package org.apache.lucene.analysis.classic;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-/**
- * This class implements the classic lucene StandardTokenizer up until 3.0
- */
+/** This class implements the classic lucene StandardTokenizer up until 3.0 */
@SuppressWarnings("fallthrough")
-
class ClassicTokenizerImpl {
/** This character denotes the end of file */
@@ -38,136 +35,125 @@ class ClassicTokenizerImpl {
public static final int YYINITIAL = 0;
/**
- * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
- * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
- * at the beginning of a line
- * l is of the form l = 2*k, k a non negative integer
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l ZZ_LEXSTATE[l+1] is the state in
+ * the DFA for the lexical state l at the beginning of a line l is of the form l = 2*k, k a non
+ * negative integer
*/
- private static final int ZZ_LEXSTATE[] = {
- 0, 0
- };
-
- /**
- * Translates characters to character classes
- */
- private static final String ZZ_CMAP_PACKED =
- "\46\0\1\5\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0"+
- "\1\6\32\12\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12"+
- "\4\0\1\12\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12"+
- "\34\0\136\12\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12"+
- "\11\0\1\12\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12"+
- "\1\0\24\12\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12"+
- "\12\0\71\12\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12"+
- "\67\0\46\12\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12"+
- "\56\0\32\12\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12"+
- "\17\0\2\12\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0"+
- "\46\12\u015f\0\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0"+
- "\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0"+
- "\1\12\3\0\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12"+
- "\23\0\6\12\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12"+
- "\1\0\2\12\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2"+
- "\2\0\3\12\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12"+
- "\1\0\7\12\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12"+
- "\17\0\1\12\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12"+
- "\1\0\7\12\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12"+
- "\1\0\3\12\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12"+
- "\3\0\2\12\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12"+
- "\3\0\10\12\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12"+
- "\1\0\27\12\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2"+
- "\25\0\10\12\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12"+
- "\44\0\1\12\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12"+
- "\1\0\27\12\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12"+
- "\3\0\30\12\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1"+
- "\60\12\1\1\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0"+
- "\1\12\2\0\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0"+
- "\7\12\1\0\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0"+
- "\4\12\1\0\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0"+
- "\12\2\2\0\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0"+
- "\42\12\35\0\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0"+
- "\12\2\6\0\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0"+
- "\104\12\5\0\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0"+
- "\4\12\2\0\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0"+
- "\1\12\1\0\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0"+
- "\7\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0"+
- "\27\12\1\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
- "\47\12\1\0\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0"+
- "\10\12\12\0\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0"+
- "\12\2\6\0\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0"+
- "\26\12\2\0\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0"+
- "\1\12\1\0\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0"+
- "\7\12\1\0\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0"+
- "\6\12\4\0\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0"+
- "\1\12\4\0\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0"+
- "\1\12\1\0\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0"+
- "\7\12\u0ecb\0\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13"+
- "\2\13\132\13\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0"+
- "\30\12\70\0\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13"+
- "\132\13\u048d\12\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12"+
- "\5\0\1\12\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12"+
- "\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
- "\2\0\66\12\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12"+
- "\23\0\12\2\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12"+
- "\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
-
- /**
- * Translates characters to character classes
- */
- private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
-
- /**
- * Translates DFA states to action switch labels.
- */
- private static final int [] ZZ_ACTION = zzUnpackAction();
+ private static final int ZZ_LEXSTATE[] = {0, 0};
+
+ /** Translates characters to character classes */
+ private static final String ZZ_CMAP_PACKED =
+ "\46\0\1\5\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0"
+ + "\1\6\32\12\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12"
+ + "\4\0\1\12\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12"
+ + "\34\0\136\12\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12"
+ + "\11\0\1\12\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12"
+ + "\1\0\24\12\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12"
+ + "\12\0\71\12\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12"
+ + "\67\0\46\12\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12"
+ + "\56\0\32\12\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12"
+ + "\17\0\2\12\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0"
+ + "\46\12\u015f\0\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0"
+ + "\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0"
+ + "\1\12\3\0\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12"
+ + "\23\0\6\12\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12"
+ + "\1\0\2\12\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2"
+ + "\2\0\3\12\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12"
+ + "\1\0\7\12\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12"
+ + "\17\0\1\12\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12"
+ + "\1\0\7\12\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12"
+ + "\1\0\3\12\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12"
+ + "\3\0\2\12\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12"
+ + "\3\0\10\12\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12"
+ + "\1\0\27\12\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2"
+ + "\25\0\10\12\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12"
+ + "\44\0\1\12\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12"
+ + "\1\0\27\12\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12"
+ + "\3\0\30\12\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1"
+ + "\60\12\1\1\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0"
+ + "\1\12\2\0\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0"
+ + "\7\12\1\0\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0"
+ + "\4\12\1\0\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0"
+ + "\12\2\2\0\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0"
+ + "\42\12\35\0\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0"
+ + "\12\2\6\0\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0"
+ + "\104\12\5\0\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0"
+ + "\4\12\2\0\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0"
+ + "\1\12\1\0\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0"
+ + "\7\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0"
+ + "\27\12\1\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"
+ + "\47\12\1\0\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0"
+ + "\10\12\12\0\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0"
+ + "\12\2\6\0\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0"
+ + "\26\12\2\0\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0"
+ + "\1\12\1\0\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0"
+ + "\7\12\1\0\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0"
+ + "\6\12\4\0\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0"
+ + "\1\12\4\0\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0"
+ + "\1\12\1\0\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0"
+ + "\7\12\u0ecb\0\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13"
+ + "\2\13\132\13\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0"
+ + "\30\12\70\0\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13"
+ + "\132\13\u048d\12\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12"
+ + "\5\0\1\12\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12"
+ + "\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"
+ + "\2\0\66\12\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12"
+ + "\23\0\12\2\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12"
+ + "\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
+
+ /** Translates characters to character classes */
+ private static final char[] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /** Translates DFA states to action switch labels. */
+ private static final int[] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
- "\1\0\1\1\3\2\1\3\13\0\1\2\3\4\2\0"+
- "\1\5\1\0\1\5\3\4\6\5\1\6\1\4\2\7"+
- "\1\10\1\0\1\10\3\0\2\10\1\11\1\12\1\4";
+ "\1\0\1\1\3\2\1\3\13\0\1\2\3\4\2\0"
+ + "\1\5\1\0\1\5\3\4\6\5\1\6\1\4\2\7"
+ + "\1\10\1\0\1\10\3\0\2\10\1\11\1\12\1\4";
- private static int [] zzUnpackAction() {
- int [] result = new int[50];
+ private static int[] zzUnpackAction() {
+ int[] result = new int[50];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
}
- private static int zzUnpackAction(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
+ private static int zzUnpackAction(String packed, int offset, int[] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
- do result[j++] = value; while (--count > 0);
+ do result[j++] = value;
+ while (--count > 0);
}
return j;
}
-
- /**
- * Translates a state to a row index in the transition table
- */
- private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+ /** Translates a state to a row index in the transition table */
+ private static final int[] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
- "\0\0\0\14\0\30\0\44\0\60\0\14\0\74\0\110"+
- "\0\124\0\140\0\154\0\170\0\204\0\220\0\234\0\250"+
- "\0\264\0\300\0\314\0\330\0\344\0\360\0\374\0\u0108"+
- "\0\u0114\0\u0120\0\u012c\0\u0138\0\u0144\0\u0150\0\u015c\0\u0168"+
- "\0\u0174\0\u0180\0\u018c\0\u0198\0\u01a4\0\250\0\u01b0\0\u01bc"+
- "\0\u01c8\0\u01d4\0\u01e0\0\u01ec\0\u01f8\0\74\0\154\0\u0204"+
- "\0\u0210\0\u021c";
-
- private static int [] zzUnpackRowMap() {
- int [] result = new int[50];
+ "\0\0\0\14\0\30\0\44\0\60\0\14\0\74\0\110"
+ + "\0\124\0\140\0\154\0\170\0\204\0\220\0\234\0\250"
+ + "\0\264\0\300\0\314\0\330\0\344\0\360\0\374\0\u0108"
+ + "\0\u0114\0\u0120\0\u012c\0\u0138\0\u0144\0\u0150\0\u015c\0\u0168"
+ + "\0\u0174\0\u0180\0\u018c\0\u0198\0\u01a4\0\250\0\u01b0\0\u01bc"
+ + "\0\u01c8\0\u01d4\0\u01e0\0\u01ec\0\u01f8\0\74\0\154\0\u0204"
+ + "\0\u0210\0\u021c";
+
+ private static int[] zzUnpackRowMap() {
+ int[] result = new int[50];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
}
- private static int zzUnpackRowMap(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
+ private static int zzUnpackRowMap(String packed, int offset, int[] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int high = packed.charAt(i++) << 16;
@@ -176,74 +162,72 @@ class ClassicTokenizerImpl {
return j;
}
- /**
- * The transition table of the DFA
- */
- private static final int [] ZZ_TRANS = zzUnpackTrans();
+ /** The transition table of the DFA */
+ private static final int[] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
- "\1\2\1\3\1\4\7\2\1\5\1\6\15\0\2\3"+
- "\1\0\1\7\1\0\1\10\2\11\1\12\1\3\2\0"+
- "\1\3\1\4\1\0\1\13\1\0\1\10\2\14\1\15"+
- "\1\4\2\0\1\3\1\4\1\16\1\17\1\20\1\21"+
- "\2\11\1\12\1\22\2\0\1\23\1\24\7\0\1\25"+
- "\2\0\2\26\7\0\1\26\2\0\1\27\1\30\7\0"+
- "\1\31\3\0\1\32\7\0\1\12\2\0\1\33\1\34"+
- "\7\0\1\35\2\0\1\36\1\37\7\0\1\40\2\0"+
- "\1\41\1\42\7\0\1\43\13\0\1\44\2\0\1\23"+
- "\1\24\7\0\1\45\13\0\1\46\2\0\2\26\7\0"+
- "\1\47\2\0\1\3\1\4\1\16\1\7\1\20\1\21"+
- "\2\11\1\12\1\22\2\0\2\23\1\0\1\50\1\0"+
- "\1\10\2\51\1\0\1\23\2\0\1\23\1\24\1\0"+
- "\1\52\1\0\1\10\2\53\1\54\1\24\2\0\1\23"+
- "\1\24\1\0\1\50\1\0\1\10\2\51\1\0\1\25"+
- "\2\0\2\26\1\0\1\55\2\0\1\55\2\0\1\26"+
- "\2\0\2\27\1\0\1\51\1\0\1\10\2\51\1\0"+
- "\1\27\2\0\1\27\1\30\1\0\1\53\1\0\1\10"+
- "\2\53\1\54\1\30\2\0\1\27\1\30\1\0\1\51"+
- "\1\0\1\10\2\51\1\0\1\31\3\0\1\32\1\0"+
- "\1\54\2\0\3\54\1\32\2\0\2\33\1\0\1\56"+
- "\1\0\1\10\2\11\1\12\1\33\2\0\1\33\1\34"+
- "\1\0\1\57\1\0\1\10\2\14\1\15\1\34\2\0"+
- "\1\33\1\34\1\0\1\56\1\0\1\10\2\11\1\12"+
- "\1\35\2\0\2\36\1\0\1\11\1\0\1\10\2\11"+
- "\1\12\1\36\2\0\1\36\1\37\1\0\1\14\1\0"+
- "\1\10\2\14\1\15\1\37\2\0\1\36\1\37\1\0"+
- "\1\11\1\0\1\10\2\11\1\12\1\40\2\0\2\41"+
- "\1\0\1\12\2\0\3\12\1\41\2\0\1\41\1\42"+
- "\1\0\1\15\2\0\3\15\1\42\2\0\1\41\1\42"+
- "\1\0\1\12\2\0\3\12\1\43\4\0\1\16\6\0"+
- "\1\44\2\0\1\23\1\24\1\0\1\60\1\0\1\10"+
- "\2\51\1\0\1\25\2\0\2\26\1\0\1\55\2\0"+
- "\1\55\2\0\1\47\2\0\2\23\7\0\1\23\2\0"+
- "\2\27\7\0\1\27\2\0\2\33\7\0\1\33\2\0"+
- "\2\36\7\0\1\36\2\0\2\41\7\0\1\41\2\0"+
- "\2\61\7\0\1\61\2\0\2\23\7\0\1\62\2\0"+
- "\2\61\1\0\1\55\2\0\1\55\2\0\1\61\2\0"+
- "\2\23\1\0\1\60\1\0\1\10\2\51\1\0\1\23"+
- "\1\0";
-
- private static int [] zzUnpackTrans() {
- int [] result = new int[552];
+ "\1\2\1\3\1\4\7\2\1\5\1\6\15\0\2\3"
+ + "\1\0\1\7\1\0\1\10\2\11\1\12\1\3\2\0"
+ + "\1\3\1\4\1\0\1\13\1\0\1\10\2\14\1\15"
+ + "\1\4\2\0\1\3\1\4\1\16\1\17\1\20\1\21"
+ + "\2\11\1\12\1\22\2\0\1\23\1\24\7\0\1\25"
+ + "\2\0\2\26\7\0\1\26\2\0\1\27\1\30\7\0"
+ + "\1\31\3\0\1\32\7\0\1\12\2\0\1\33\1\34"
+ + "\7\0\1\35\2\0\1\36\1\37\7\0\1\40\2\0"
+ + "\1\41\1\42\7\0\1\43\13\0\1\44\2\0\1\23"
+ + "\1\24\7\0\1\45\13\0\1\46\2\0\2\26\7\0"
+ + "\1\47\2\0\1\3\1\4\1\16\1\7\1\20\1\21"
+ + "\2\11\1\12\1\22\2\0\2\23\1\0\1\50\1\0"
+ + "\1\10\2\51\1\0\1\23\2\0\1\23\1\24\1\0"
+ + "\1\52\1\0\1\10\2\53\1\54\1\24\2\0\1\23"
+ + "\1\24\1\0\1\50\1\0\1\10\2\51\1\0\1\25"
+ + "\2\0\2\26\1\0\1\55\2\0\1\55\2\0\1\26"
+ + "\2\0\2\27\1\0\1\51\1\0\1\10\2\51\1\0"
+ + "\1\27\2\0\1\27\1\30\1\0\1\53\1\0\1\10"
+ + "\2\53\1\54\1\30\2\0\1\27\1\30\1\0\1\51"
+ + "\1\0\1\10\2\51\1\0\1\31\3\0\1\32\1\0"
+ + "\1\54\2\0\3\54\1\32\2\0\2\33\1\0\1\56"
+ + "\1\0\1\10\2\11\1\12\1\33\2\0\1\33\1\34"
+ + "\1\0\1\57\1\0\1\10\2\14\1\15\1\34\2\0"
+ + "\1\33\1\34\1\0\1\56\1\0\1\10\2\11\1\12"
+ + "\1\35\2\0\2\36\1\0\1\11\1\0\1\10\2\11"
+ + "\1\12\1\36\2\0\1\36\1\37\1\0\1\14\1\0"
+ + "\1\10\2\14\1\15\1\37\2\0\1\36\1\37\1\0"
+ + "\1\11\1\0\1\10\2\11\1\12\1\40\2\0\2\41"
+ + "\1\0\1\12\2\0\3\12\1\41\2\0\1\41\1\42"
+ + "\1\0\1\15\2\0\3\15\1\42\2\0\1\41\1\42"
+ + "\1\0\1\12\2\0\3\12\1\43\4\0\1\16\6\0"
+ + "\1\44\2\0\1\23\1\24\1\0\1\60\1\0\1\10"
+ + "\2\51\1\0\1\25\2\0\2\26\1\0\1\55\2\0"
+ + "\1\55\2\0\1\47\2\0\2\23\7\0\1\23\2\0"
+ + "\2\27\7\0\1\27\2\0\2\33\7\0\1\33\2\0"
+ + "\2\36\7\0\1\36\2\0\2\41\7\0\1\41\2\0"
+ + "\2\61\7\0\1\61\2\0\2\23\7\0\1\62\2\0"
+ + "\2\61\1\0\1\55\2\0\1\55\2\0\1\61\2\0"
+ + "\2\23\1\0\1\60\1\0\1\10\2\51\1\0\1\23"
+ + "\1\0";
+
+ private static int[] zzUnpackTrans() {
+ int[] result = new int[552];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
}
- private static int zzUnpackTrans(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
+ private static int zzUnpackTrans(String packed, int offset, int[] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
value--;
- do result[j++] = value; while (--count > 0);
+ do result[j++] = value;
+ while (--count > 0);
}
return j;
}
-
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
@@ -256,30 +240,28 @@ class ClassicTokenizerImpl {
"Error: pushback value was too large"
};
- /**
- * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
- */
- private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+ /** ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code> */
+ private static final int[] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
- "\1\0\1\11\3\1\1\11\13\0\4\1\2\0\1\1"+
- "\1\0\17\1\1\0\1\1\3\0\5\1";
+ "\1\0\1\11\3\1\1\11\13\0\4\1\2\0\1\1" + "\1\0\17\1\1\0\1\1\3\0\5\1";
- private static int [] zzUnpackAttribute() {
- int [] result = new int[50];
+ private static int[] zzUnpackAttribute() {
+ int[] result = new int[50];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
}
- private static int zzUnpackAttribute(String packed, int offset, int [] result) {
- int i = 0; /* index in packed string */
- int j = offset; /* index in unpacked array */
+ private static int zzUnpackAttribute(String packed, int offset, int[] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
- do result[j++] = value; while (--count > 0);
+ do result[j++] = value;
+ while (--count > 0);
}
return j;
}
@@ -293,8 +275,9 @@ class ClassicTokenizerImpl {
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
- /** this buffer contains the current text to be matched and is
- the source of the yytext() string */
+ /**
+ * this buffer contains the current text to be matched and is the source of the yytext() string
+ */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the textposition at the last accepting state */
@@ -306,8 +289,7 @@ class ClassicTokenizerImpl {
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
- /** endRead marks the last character in the buffer, that has been read
- from input */
+ /** endRead marks the last character in the buffer, that has been read from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
@@ -316,15 +298,10 @@ class ClassicTokenizerImpl {
/** the number of characters up to the start of the matched text */
private int yychar;
- /**
- * the number of characters from the last newline up to the start of the
- * matched text
- */
+ /** the number of characters from the last newline up to the start of the matched text */
private int yycolumn;
- /**
- * zzAtBOL == true iff the scanner is currently at the beginning of a line
- */
+ /** zzAtBOL == true iff the scanner is currently at the beginning of a line */
private boolean zzAtBOL = true;
/** zzAtEOF == true iff the scanner is at the EOF */
@@ -332,81 +309,74 @@ class ClassicTokenizerImpl {
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
-
- /**
- * The number of occupied positions in zzBuffer beyond zzEndRead.
- * When a lead/high surrogate has been read from the input stream
- * into the final zzBuffer position, this will have a value of 1;
+
+ /**
+ * The number of occupied positions in zzBuffer beyond zzEndRead. When a lead/high surrogate has
+ * been read from the input stream into the final zzBuffer position, this will have a value of 1;
* otherwise, it will have a value of 0.
*/
private int zzFinalHighSurrogate = 0;
/* user code: */
-public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
-public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
-public static final int ACRONYM = ClassicTokenizer.ACRONYM;
-public static final int COMPANY = ClassicTokenizer.COMPANY;
-public static final int EMAIL = ClassicTokenizer.EMAIL;
-public static final int HOST = ClassicTokenizer.HOST;
-public static final int NUM = ClassicTokenizer.NUM;
-public static final int CJ = ClassicTokenizer.CJ;
-public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
+ public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
+ public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
+ public static final int ACRONYM = ClassicTokenizer.ACRONYM;
+ public static final int COMPANY = ClassicTokenizer.COMPANY;
+ public static final int EMAIL = ClassicTokenizer.EMAIL;
+ public static final int HOST = ClassicTokenizer.HOST;
+ public static final int NUM = ClassicTokenizer.NUM;
+ public static final int CJ = ClassicTokenizer.CJ;
+ public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
-public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
+ public static final String[] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
-public final int yychar()
-{
+ public final int yychar() {
return yychar;
-}
-
-/**
- * Fills CharTermAttribute with the current token text.
- */
-public final void getText(CharTermAttribute t) {
- t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
-}
+ }
- public final void setBufferSize(int numChars) {
- throw new UnsupportedOperationException();
- }
+ /** Fills CharTermAttribute with the current token text. */
+ public final void getText(CharTermAttribute t) {
+ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+ public final void setBufferSize(int numChars) {
+ throw new UnsupportedOperationException();
+ }
/**
* Creates a new scanner
*
- * @param in the java.io.Reader to read input from.
+ * @param in the java.io.Reader to read input from.
*/
ClassicTokenizerImpl(java.io.Reader in) {
this.zzReader = in;
}
-
- /**
+ /**
* Unpacks the compressed character translation table.
*
- * @param packed the packed character translation table
- * @return the unpacked character translation table
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
*/
- private static char [] zzUnpackCMap(String packed) {
- char [] map = new char[0x110000];
- int i = 0; /* index in packed string */
- int j = 0; /* index in unpacked array */
+ private static char[] zzUnpackCMap(String packed) {
+ char[] map = new char[0x110000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
while (i < 1170) {
- int count = packed.charAt(i++);
+ int count = packed.charAt(i++);
char value = packed.charAt(i++);
- do map[j++] = value; while (--count > 0);
+ do map[j++] = value;
+ while (--count > 0);
}
return map;
}
-
/**
* Refills the input buffer.
*
- * @return <code>false</code>, iff there was new input.
- *
- * @exception java.io.IOException if any I/O-Error occurs
+ * @return <code>false</code>, iff there was new input.
+ * @exception java.io.IOException if any I/O-Error occurs
*/
private boolean zzRefill() throws java.io.IOException {
@@ -414,21 +384,19 @@ public final void getText(CharTermAttribute t) {
if (zzStartRead > 0) {
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
- System.arraycopy(zzBuffer, zzStartRead,
- zzBuffer, 0,
- zzEndRead-zzStartRead);
+ System.arraycopy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead - zzStartRead);
/* translate stored positions */
- zzEndRead-= zzStartRead;
- zzCurrentPos-= zzStartRead;
- zzMarkedPos-= zzStartRead;
+ zzEndRead -= zzStartRead;
+ zzCurrentPos -= zzStartRead;
+ zzMarkedPos -= zzStartRead;
zzStartRead = 0;
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
/* if not: blow it up */
- char newBuffer[] = new char[zzBuffer.length*2];
+ char newBuffer[] = new char[zzBuffer.length * 2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
zzEndRead += zzFinalHighSurrogate;
@@ -441,13 +409,14 @@ public final void getText(CharTermAttribute t) {
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
- throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ throw new java.io.IOException(
+ "Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested to few chars to
- encode a full Unicode character. We assume that a Reader would
- otherwise never return half characters. */
+ encode a full Unicode character. We assume that a Reader would
+ otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
@@ -462,54 +431,42 @@ public final void getText(CharTermAttribute t) {
return true;
}
-
- /**
- * Closes the input stream.
- */
+ /** Closes the input stream. */
public final void yyclose() throws java.io.IOException {
- zzAtEOF = true; /* indicate end of file */
- zzEndRead = zzStartRead; /* invalidate buffer */
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
- if (zzReader != null)
- zzReader.close();
+ if (zzReader != null) zzReader.close();
}
-
/**
- * Resets the scanner to read from a new input stream.
- * Does not close the old reader.
+ * Resets the scanner to read from a new input stream. Does not close the old reader.
*
- * All internal variables are reset, the old input stream
- * <b>cannot</b> be reused (internal buffer is discarded and lost).
- * Lexical state is set to <code>ZZ_INITIAL</code>.
+ * <p>All internal variables are reset, the old input stream <b>cannot</b> be reused (internal
+ * buffer is discarded and lost). Lexical state is set to <code>ZZ_INITIAL</code>.
*
- * Internal scan buffer is resized down to its initial length, if it has grown.
+ * <p>Internal scan buffer is resized down to its initial length, if it has grown.
*
- * @param reader the new input stream
+ * @param reader the new input stream
*/
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
- zzAtBOL = true;
- zzAtEOF = false;
+ zzAtBOL = true;
+ zzAtEOF = false;
zzEOFDone = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = 0;
zzFinalHighSurrogate = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
- if (zzBuffer.length > ZZ_BUFFERSIZE)
- zzBuffer = new char[ZZ_BUFFERSIZE];
+ if (zzBuffer.length > ZZ_BUFFERSIZE) zzBuffer = new char[ZZ_BUFFERSIZE];
}
-
- /**
- * Returns the current lexical state.
- */
+ /** Returns the current lexical state. */
public final int yystate() {
return zzLexicalState;
}
-
/**
* Enters a new lexical state
*
@@ -519,88 +476,71 @@ public final void getText(CharTermAttribute t) {
zzLexicalState = newState;
}
-
- /**
- * Returns the text matched by the current regular expression.
- */
+ /** Returns the text matched by the current regular expression. */
public final String yytext() {
- return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ return new String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
}
-
/**
- * Returns the character at position <code>pos</code> from the
- * matched text.
- *
- * It is equivalent to yytext().charAt(pos), but faster
+ * Returns the character at position <code>pos</code> from the matched text.
*
- * @param pos the position of the character to fetch.
- * A value from 0 to yylength()-1.
+ * <p>It is equivalent to yytext().charAt(pos), but faster
*
+ * @param pos the position of the character to fetch. A value from 0 to yylength()-1.
* @return the character at position pos
*/
public final char yycharat(int pos) {
- return zzBuffer[zzStartRead+pos];
+ return zzBuffer[zzStartRead + pos];
}
-
- /**
- * Returns the length of the matched text region.
- */
+ /** Returns the length of the matched text region. */
public final int yylength() {
- return zzMarkedPos-zzStartRead;
+ return zzMarkedPos - zzStartRead;
}
-
/**
* Reports an error that occured while scanning.
*
- * In a wellformed scanner (no or only correct usage of
- * yypushback(int) and a match-all fallback rule) this method
- * will only be called with things that "Can't Possibly Happen".
- * If this method is called, something is seriously wrong
- * (e.g. a JFlex bug producing a faulty scanner etc.).
+ * <p>In a wellformed scanner (no or only correct usage of yypushback(int) and a match-all
+ * fallback rule) this method will only be called with things that "Can't Possibly Happen". If
+ * this method is called, something is seriously wrong (e.g. a JFlex bug producing a faulty
+ * scanner etc.).
*
- * Usual syntax/scanner level error handling should be done
- * in error fallback rules.
+ * <p>Usual syntax/scanner level error handling should be done in error fallback rules.
*
- * @param errorCode the code of the errormessage to display
+ * @param errorCode the code of the errormessage to display
*/
private void zzScanError(int errorCode) {
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
- }
- catch (ArrayIndexOutOfBoundsException e) {
+ } catch (ArrayIndexOutOfBoundsException e) {
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
throw new Error(message);
- }
-
+ }
/**
* Pushes the specified amount of characters back into the input stream.
*
- * They will be read again by then next call of the scanning method
+ * <p>They will be read again by then next call of the scanning method
*
- * @param number the number of characters to be read again.
- * This number must not be greater than yylength()!
+ * @param number the number of characters to be read again. This number must not be greater than
+ * yylength()!
*/
- public void yypushback(int number) {
- if ( number > yylength() )
- zzScanError(ZZ_PUSHBACK_2BIG);
+ public void yypushback(int number) {
+ if (number > yylength()) zzScanError(ZZ_PUSHBACK_2BIG);
zzMarkedPos -= number;
}
-
/**
- * Resumes scanning until the next regular expression is matched,
- * the end of input is encountered or an I/O-Error occurs.
+ * Resumes scanning until the next regular expression is matched, the end of input is encountered
+ * or an I/O-Error occurs.
*
- * @return the next token
- * @exception java.io.IOException if any I/O-Error occurs
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
*/
public int getNextToken() throws java.io.IOException {
int zzInput;
@@ -610,72 +550,68 @@ public final void getText(CharTermAttribute t) {
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
- char [] zzBufferL = zzBuffer;
- char [] zzCMapL = ZZ_CMAP;
+ char[] zzBufferL = zzBuffer;
+ char[] zzCMapL = ZZ_CMAP;
- int [] zzTransL = ZZ_TRANS;
- int [] zzRowMapL = ZZ_ROWMAP;
- int [] zzAttrL = ZZ_ATTRIBUTE;
+ int[] zzTransL = ZZ_TRANS;
+ int[] zzRowMapL = ZZ_ROWMAP;
+ int[] zzAttrL = ZZ_ATTRIBUTE;
while (true) {
zzMarkedPosL = zzMarkedPos;
- yychar+= zzMarkedPosL-zzStartRead;
+ yychar += zzMarkedPosL - zzStartRead;
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
-
+
zzState = ZZ_LEXSTATE[zzLexicalState];
// set up zzAction for empty match case:
int zzAttributes = zzAttrL[zzState];
- if ( (zzAttributes & 1) == 1 ) {
+ if ((zzAttributes & 1) == 1) {
zzAction = zzState;
}
-
- zzForAction: {
+ zzForAction:
+ {
while (true) {
-
+
if (zzCurrentPosL < zzEndReadL) {
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
zzCurrentPosL += Character.charCount(zzInput);
- }
- else if (zzAtEOF) {
+ } else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
- }
- else {
+ } else {
// store back cached positions
- zzCurrentPos = zzCurrentPosL;
- zzMarkedPos = zzMarkedPosL;
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
boolean eof = zzRefill();
// get translated positions and possibly new buffer
- zzCurrentPosL = zzCurrentPos;
- zzMarkedPosL = zzMarkedPos;
- zzBufferL = zzBuffer;
- zzEndReadL = zzEndRead;
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
if (eof) {
zzInput = YYEOF;
break zzForAction;
- }
- else {
+ } else {
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
zzCurrentPosL += Character.charCount(zzInput);
}
}
- int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]];
if (zzNext == -1) break zzForAction;
zzState = zzNext;
zzAttributes = zzAttrL[zzState];
- if ( (zzAttributes & 1) == 1 ) {
+ if ((zzAttributes & 1) == 1) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
- if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ if ((zzAttributes & 8) == 8) break zzForAction;
}
-
}
}
@@ -685,65 +621,83 @@ public final void getText(CharTermAttribute t) {
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
- }
- else {
+ } else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 1:
- { /* Break so we don't hit fall-through warning: */ break;/* ignore */
- }
+ case 1:
+ {
+ /* Break so we don't hit fall-through warning: */
+ break; /* ignore */
+ }
// fall through
- case 11: break;
- case 2:
- { return ALPHANUM;
- }
+ case 11:
+ break;
+ case 2:
+ {
+ return ALPHANUM;
+ }
// fall through
- case 12: break;
- case 3:
- { return CJ;
- }
+ case 12:
+ break;
+ case 3:
+ {
+ return CJ;
+ }
// fall through
- case 13: break;
- case 4:
- { return HOST;
- }
+ case 13:
+ break;
+ case 4:
+ {
+ return HOST;
+ }
// fall through
- case 14: break;
- case 5:
- { return NUM;
- }
+ case 14:
+ break;
+ case 5:
+ {
+ return NUM;
+ }
// fall through
- case 15: break;
- case 6:
- { return APOSTROPHE;
- }
+ case 15:
+ break;
+ case 6:
+ {
+ return APOSTROPHE;
+ }
// fall through
- case 16: break;
- case 7:
- { return COMPANY;
- }
+ case 16:
+ break;
+ case 7:
+ {
+ return COMPANY;
+ }
// fall through
- case 17: break;
- case 8:
- { return ACRONYM_DEP;
- }
+ case 17:
+ break;
+ case 8:
+ {
+ return ACRONYM_DEP;
+ }
// fall through
- case 18: break;
- case 9:
- { return ACRONYM;
- }
+ case 18:
+ break;
+ case 9:
+ {
+ return ACRONYM;
+ }
// fall through
- case 19: break;
- case 10:
- { return EMAIL;
- }
+ case 19:
+ break;
+ case 10:
+ {
+ return EMAIL;
+ }
// fall through
- case 20: break;
+ case 20:
+ break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
-
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex
index 07d7857..798d9a5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.standard;
+package org.apache.lucene.analysis.classic;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/package-info.java
index 4b2c471..47dc52c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/package-info.java
@@ -16,14 +16,13 @@
*/
/**
- * Fast, general-purpose grammar-based tokenizers.
- * {@link org.apache.lucene.analysis.classic.ClassicTokenizer ClassicTokenizer}:
- * this class was formerly (prior to Lucene 3.1) named
- * <code>StandardTokenizer</code>. (Its tokenization rules are not
- * based on the Unicode Text Segmentation algorithm.)
- * {@link org.apache.lucene.analysis.classic.ClassicAnalyzer ClassicAnalyzer} includes
- * {@link org.apache.lucene.analysis.classic.ClassicTokenizer ClassicTokenizer},
- * {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
- * and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ * Fast, general-purpose grammar-based tokenizers. {@link
+ * org.apache.lucene.analysis.classic.ClassicTokenizer ClassicTokenizer}: this class was formerly
+ * (prior to Lucene 3.1) named <code>StandardTokenizer</code>. (Its tokenization rules are not based
+ * on the Unicode Text Segmentation algorithm.) {@link
+ * org.apache.lucene.analysis.classic.ClassicAnalyzer ClassicAnalyzer} includes {@link
+ * org.apache.lucene.analysis.classic.ClassicTokenizer ClassicTokenizer}, {@link
+ * org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter} and {@link
+ * org.apache.lucene.analysis.StopFilter StopFilter}.
*/
-package org.apache.lucene.analysis.classic;
\ No newline at end of file
+package org.apache.lucene.analysis.classic;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
index f32b8c0..f4c2f86 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
@@ -17,7 +17,6 @@
package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -28,20 +27,20 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/*
- * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
+ * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
*/
/**
- * Construct bigrams for frequently occurring terms while indexing. Single terms
- * are still indexed too, with bigrams overlaid. This is achieved through the
- * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
- * of {@link #GRAM_TYPE} Example:
+ * Construct bigrams for frequently occurring terms while indexing. Single terms are still indexed
+ * too, with bigrams overlaid. This is achieved through the use of {@link
+ * PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type of {@link #GRAM_TYPE}
+ * Example:
+ *
* <ul>
- * <li>input:"the quick brown fox"</li>
- * <li>output:|"the","the-quick"|"brown"|"fox"|</li>
- * <li>"the-quick" has a position increment of 0 so it is in the same position
- * as "the" "the-quick" has a term.type() of "gram"</li>
- *
+ * <li>input:"the quick brown fox"
+ * <li>output:|"the","the-quick"|"brown"|"fox"|
+ * <li>"the-quick" has a position increment of 0 so it is in the same position as "the"
+ * "the-quick" has a term.type() of "gram"
* </ul>
*/
@@ -56,23 +55,24 @@ public final class CommonGramsFilter extends TokenFilter {
private final CharArraySet commonWords;
private final StringBuilder buffer = new StringBuilder();
-
+
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
- private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute =
+ addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAttribute =
+ addAttribute(PositionLengthAttribute.class);
private int lastStartOffset;
private boolean lastWasCommon;
private State savedState;
/**
- * Construct a token stream filtering the given input using a Set of common
- * words to create bigrams. Outputs both unigrams with position increment and
- * bigrams with position increment 0 type=gram where one or both of the words
- * in a potential bigram are in the set of common words .
- *
+ * Construct a token stream filtering the given input using a Set of common words to create
+ * bigrams. Outputs both unigrams with position increment and bigrams with position increment 0
+ * type=gram where one or both of the words in a potential bigram are in the set of common words .
+ *
* @param input TokenStream input in filter chain
* @param commonWords The set of common words.
*/
@@ -82,20 +82,17 @@ public final class CommonGramsFilter extends TokenFilter {
}
/**
- * Inserts bigrams for common words into a token stream. For each input token,
- * output the token. If the token and/or the following token are in the list
- * of common words also output a bigram with position increment 0 and
- * type="gram"
+ * Inserts bigrams for common words into a token stream. For each input token, output the token.
+ * If the token and/or the following token are in the list of common words also output a bigram
+ * with position increment 0 and type="gram"
*
- * TODO:Consider adding an option to not emit unigram stopwords
- * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
- * changed to work with this.
+ * <p>TODO:Consider adding an option to not emit unigram stopwords as in CDL XTF BigramStopFilter,
+ * CommonGramsQueryFilter would need to be changed to work with this.
*
- * TODO: Consider optimizing for the case of three
- * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
- * "of-the", "the-year" but with proper management of positions we could
- * eliminate the middle bigram "of-the"and save a disk seek and a whole set of
- * position lookups.
+ * <p>TODO: Consider optimizing for the case of three commongrams i.e "man of the year" normally
+ * produces 3 bigrams: "man-of", "of-the", "the-year" but with proper management of positions we
+ * could eliminate the middle bigram "of-the"and save a disk seek and a whole set of position
+ * lookups.
*/
@Override
public boolean incrementToken() throws IOException {
@@ -108,15 +105,15 @@ public final class CommonGramsFilter extends TokenFilter {
} else if (!input.incrementToken()) {
return false;
}
-
- /* We build n-grams before and after stopwords.
+
+ /* We build n-grams before and after stopwords.
* When valid, the buffer always contains at least the separator.
* If it's empty, there is nothing before this stopword.
*/
if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
savedState = captureState();
gramToken();
- return true;
+ return true;
}
saveTermBuffer();
@@ -131,20 +128,17 @@ public final class CommonGramsFilter extends TokenFilter {
buffer.setLength(0);
}
- // ================================================= Helper Methods ================================================
-
/**
* Determines if the current token is a common term
*
* @return {@code true} if the current token is a common term, {@code false} otherwise
*/
private boolean isCommon() {
- return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
+ return commonWords != null
+ && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
}
- /**
- * Saves this information to form the left part of a gram
- */
+ /** Saves this information to form the left part of a gram */
private void saveTermBuffer() {
buffer.setLength(0);
buffer.append(termAttribute.buffer(), 0, termAttribute.length());
@@ -153,9 +147,7 @@ public final class CommonGramsFilter extends TokenFilter {
lastWasCommon = isCommon();
}
- /**
- * Constructs a compound token.
- */
+ /** Constructs a compound token. */
private void gramToken() {
buffer.append(termAttribute.buffer(), 0, termAttribute.length());
int endOffset = offsetAttribute.endOffset();
@@ -167,7 +159,7 @@ public final class CommonGramsFilter extends TokenFilter {
if (length > termText.length) {
termText = termAttribute.resizeBuffer(length);
}
-
+
buffer.getChars(0, length, termText, 0);
termAttribute.setLength(length);
posIncAttribute.setPositionIncrement(0);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index a181f4a..ccf26bd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -16,20 +16,19 @@
*/
package org.apache.lucene.analysis.commongrams;
-
import java.io.IOException;
import java.util.Map;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.TokenFilterFactory;
/**
* Constructs a {@link CommonGramsFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -51,9 +50,9 @@ public class CommonGramsFilterFactory extends TokenFilterFactory implements Reso
private final String commonWordFiles;
private final String format;
private final boolean ignoreCase;
-
+
/** Creates a new CommonGramsFilterFactory */
- public CommonGramsFilterFactory(Map<String,String> args) {
+ public CommonGramsFilterFactory(Map<String, String> args) {
super(args);
commonWordFiles = get(args, "words");
format = get(args, "format");
@@ -95,6 +94,3 @@ public class CommonGramsFilterFactory extends TokenFilterFactory implements Reso
return commonGrams;
}
}
-
-
-
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
index e8c98b7..80a6381 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
@@ -16,26 +16,25 @@
*/
package org.apache.lucene.analysis.commongrams;
-import java.io.IOException;
+import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
+import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
-
/**
- * Wrap a CommonGramsFilter optimizing phrase queries by only returning single
- * words when they are not a member of a bigram.
- *
- * Example:
+ * Wrap a CommonGramsFilter optimizing phrase queries by only returning single words when they are
+ * not a member of a bigram.
+ *
+ * <p>Example:
+ *
* <ul>
- * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
- * <li>output of CommomGramsFilter/input to CommonGramsQueryFilter:
- * |"the, "the-rain"|"rain" "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly"
- * <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain",
- * "falls", "mainly"
+ * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
+ * <li>output of CommomGramsFilter/input to CommonGramsQueryFilter: |"the, "the-rain"|"rain"
+ * "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly"
+ * <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
* </ul>
*/
@@ -46,16 +45,18 @@ import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE
public final class CommonGramsQueryFilter extends TokenFilter {
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
- private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLengthAttribute = addAttribute(PositionLengthAttribute.class);
-
+ private final PositionIncrementAttribute posIncAttribute =
+ addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLengthAttribute =
+ addAttribute(PositionLengthAttribute.class);
+
private State previous;
private String previousType;
private boolean exhausted;
/**
- * Constructs a new CommonGramsQueryFilter based on the provided CommomGramsFilter
- *
+ * Constructs a new CommonGramsQueryFilter based on the provided CommomGramsFilter
+ *
* @param input CommonGramsFilter the QueryFilter will use
*/
public CommonGramsQueryFilter(CommonGramsFilter input) {
@@ -69,13 +70,14 @@ public final class CommonGramsQueryFilter extends TokenFilter {
previousType = null;
exhausted = false;
}
-
+
/**
- * Output bigrams whenever possible to optimize queries. Only output unigrams
- * when they are not a member of a bigram. Example:
+ * Output bigrams whenever possible to optimize queries. Only output unigrams when they are not a
+ * member of a bigram. Example:
+ *
* <ul>
- * <li>input: "the rain in spain falls mainly"
- * <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
+ * <li>input: "the rain in spain falls mainly"
+ * <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
* </ul>
*/
@Override
@@ -87,10 +89,11 @@ public final class CommonGramsQueryFilter extends TokenFilter {
restoreState(previous);
previous = current;
previousType = typeAttribute.type();
-
+
if (isGramType()) {
posIncAttribute.setPositionIncrement(1);
- // We must set this back to 1 (from e.g. 2 or higher) otherwise the token graph is disconnected:
+ // We must set this back to 1 (from e.g. 2 or higher) otherwise the token graph is
+ // disconnected:
posLengthAttribute.setPositionLength(1);
}
return true;
@@ -104,23 +107,22 @@ public final class CommonGramsQueryFilter extends TokenFilter {
if (previous == null || GRAM_TYPE.equals(previousType)) {
return false;
}
-
+
restoreState(previous);
previous = null;
-
+
if (isGramType()) {
posIncAttribute.setPositionIncrement(1);
- // We must set this back to 1 (from e.g. 2 or higher) otherwise the token graph is disconnected:
+ // We must set this back to 1 (from e.g. 2 or higher) otherwise the token graph is
+ // disconnected:
posLengthAttribute.setPositionLength(1);
}
return true;
}
- // ================================================= Helper Methods ================================================
-
/**
* Convenience method to check if the current type is a gram type
- *
+ *
* @return {@code true} if the current type is a gram type, {@code false} otherwise
*/
public boolean isGramType() {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java
index f8e712e..0e19d78 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java
@@ -16,15 +16,13 @@
*/
package org.apache.lucene.analysis.commongrams;
-
import java.util.Map;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Construct {@link CommonGramsQueryFilter}.
- *
+ *
* <pre class="prettyprint">
* <fieldType name="text_cmmngrmsqry" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -41,7 +39,7 @@ public class CommonGramsQueryFilterFactory extends CommonGramsFilterFactory {
public static final String NAME = "commonGramsQuery";
/** Creates a new CommonGramsQueryFilterFactory */
- public CommonGramsQueryFilterFactory(Map<String,String> args) {
+ public CommonGramsQueryFilterFactory(Map<String, String> args) {
super(args);
}
@@ -50,9 +48,7 @@ public class CommonGramsQueryFilterFactory extends CommonGramsFilterFactory {
throw defaultCtorException();
}
- /**
- * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
- */
+ /** Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter */
@Override
public TokenFilter create(TokenStream input) {
CommonGramsFilter commonGrams = (CommonGramsFilter) super.create(input);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/package-info.java
index 4c9eaba..ba72f50 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/package-info.java
@@ -15,7 +15,5 @@
* limitations under the License.
*/
-/**
- * Construct n-grams for frequently occurring terms and phrases.
- */
+/** Construct n-grams for frequently occurring terms and phrases. */
package org.apache.lucene.analysis.commongrams;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
index 680e67a..bc5f4b8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -16,10 +16,8 @@
*/
package org.apache.lucene.analysis.compound;
-
import java.io.IOException;
import java.util.LinkedList;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -27,23 +25,15 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-/**
- * Base class for decomposition token filters.
- */
+/** Base class for decomposition token filters. */
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
- /**
- * The default for minimal word length that gets decomposed
- */
+ /** The default for minimal word length that gets decomposed */
public static final int DEFAULT_MIN_WORD_SIZE = 5;
- /**
- * The default for minimal length of subwords that get propagated to the output of this filter
- */
+ /** The default for minimal length of subwords that get propagated to the output of this filter */
public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;
- /**
- * The default for maximal length of subwords that get propagated to the output of this filter
- */
+ /** The default for maximal length of subwords that get propagated to the output of this filter */
public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
protected final CharArraySet dictionary;
@@ -55,34 +45,54 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionIncrementAttribute posIncAtt =
+ addAttribute(PositionIncrementAttribute.class);
private State current;
- protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
- this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+ protected CompoundWordTokenFilterBase(
+ TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
+ this(
+ input,
+ dictionary,
+ DEFAULT_MIN_WORD_SIZE,
+ DEFAULT_MIN_SUBWORD_SIZE,
+ DEFAULT_MAX_SUBWORD_SIZE,
+ onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) {
- this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+ this(
+ input,
+ dictionary,
+ DEFAULT_MIN_WORD_SIZE,
+ DEFAULT_MIN_SUBWORD_SIZE,
+ DEFAULT_MAX_SUBWORD_SIZE,
+ false);
}
- protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+ protected CompoundWordTokenFilterBase(
+ TokenStream input,
+ CharArraySet dictionary,
+ int minWordSize,
+ int minSubwordSize,
+ int maxSubwordSize,
+ boolean onlyLongestMatch) {
super(input);
- this.tokens=new LinkedList<>();
+ this.tokens = new LinkedList<>();
if (minWordSize < 0) {
throw new IllegalArgumentException("minWordSize cannot be negative");
}
- this.minWordSize=minWordSize;
+ this.minWordSize = minWordSize;
if (minSubwordSize < 0) {
throw new IllegalArgumentException("minSubwordSize cannot be negative");
}
- this.minSubwordSize=minSubwordSize;
+ this.minSubwordSize = minSubwordSize;
if (maxSubwordSize < 0) {
throw new IllegalArgumentException("maxSubwordSize cannot be negative");
}
- this.maxSubwordSize=maxSubwordSize;
- this.onlyLongestMatch=onlyLongestMatch;
+ this.maxSubwordSize = maxSubwordSize;
+ this.onlyLongestMatch = onlyLongestMatch;
this.dictionary = dictionary;
}
@@ -115,8 +125,10 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
}
}
- /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
- * The original token may not be placed in the list, as it is automatically passed through this filter.
+ /**
+ * Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the
+ * {@link #tokens} list. The original token may not be placed in the list, as it is automatically
+ * passed through this filter.
*/
protected abstract void decompose();
@@ -127,21 +139,21 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
current = null;
}
- /**
- * Helper class to hold decompounded token information
- */
+ /** Helper class to hold decompounded token information */
protected class CompoundToken {
public final CharSequence txt;
public final int startOffset, endOffset;
- /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
+ /**
+ * Construct the compound token based on a slice of the current {@link
+ * CompoundWordTokenFilterBase#termAtt}.
+ */
public CompoundToken(int offset, int length) {
this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-
+
// offsets of the original word
this.startOffset = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
this.endOffset = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
}
-
- }
+ }
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
index 2e4b837..c6278a8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
@@ -16,27 +16,23 @@
*/
package org.apache.lucene.analysis.compound;
-
-
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
/**
- * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many Germanic languages.
- * <p>
- * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
- * "Donaudampfschiff" even when you only enter "schiff".
- * It uses a brute-force algorithm to achieve this.
+ * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many
+ * Germanic languages.
+ *
+ * <p>"Donaudampfschiff" becomes Donau, dampf, schiff so that you can find "Donaudampfschiff" even
+ * when you only enter "schiff". It uses a brute-force algorithm to achieve this.
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
- * @param input
- * the {@link org.apache.lucene.analysis.TokenStream} to process
- * @param dictionary
- * the word dictionary to match against.
+ * @param input the {@link org.apache.lucene.analysis.TokenStream} to process
+ * @param dictionary the word dictionary to match against.
*/
public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet dictionary) {
super(input, dictionary);
@@ -48,21 +44,20 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
- * @param input
- * the {@link org.apache.lucene.analysis.TokenStream} to process
- * @param dictionary
- * the word dictionary to match against.
- * @param minWordSize
- * only words longer than this get processed
- * @param minSubwordSize
- * only subwords longer than this get to the output stream
- * @param maxSubwordSize
- * only subwords shorter than this get to the output stream
- * @param onlyLongestMatch
- * Add only the longest matching subword to the stream
+ * @param input the {@link org.apache.lucene.analysis.TokenStream} to process
+ * @param dictionary the word dictionary to match against.
+ * @param minWordSize only words longer than this get processed
+ * @param minSubwordSize only subwords longer than this get to the output stream
+ * @param maxSubwordSize only subwords shorter than this get to the output stream
+ * @param onlyLongestMatch Add only the longest matching subword to the stream
*/
- public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet dictionary,
- int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+ public DictionaryCompoundWordTokenFilter(
+ TokenStream input,
+ CharArraySet dictionary,
+ int minWordSize,
+ int minSubwordSize,
+ int maxSubwordSize,
+ boolean onlyLongestMatch) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
if (dictionary == null) {
throw new IllegalArgumentException("dictionary must not be null");
@@ -72,29 +67,29 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
@Override
protected void decompose() {
final int len = termAtt.length();
- for (int i=0;i<=len-this.minSubwordSize;++i) {
- CompoundToken longestMatchToken=null;
- for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
- if(i+j>len) {
- break;
- }
- if(dictionary.contains(termAtt.buffer(), i, j)) {
- if (this.onlyLongestMatch) {
- if (longestMatchToken!=null) {
- if (longestMatchToken.txt.length()<j) {
- longestMatchToken=new CompoundToken(i,j);
- }
- } else {
- longestMatchToken=new CompoundToken(i,j);
- }
- } else {
- tokens.add(new CompoundToken(i,j));
- }
- }
+ for (int i = 0; i <= len - this.minSubwordSize; ++i) {
+ CompoundToken longestMatchToken = null;
+ for (int j = this.minSubwordSize; j <= this.maxSubwordSize; ++j) {
+ if (i + j > len) {
+ break;
}
- if (this.onlyLongestMatch && longestMatchToken!=null) {
- tokens.add(longestMatchToken);
+ if (dictionary.contains(termAtt.buffer(), i, j)) {
+ if (this.onlyLongestMatch) {
+ if (longestMatchToken != null) {
+ if (longestMatchToken.txt.length() < j) {
+ longestMatchToken = new CompoundToken(i, j);
+ }
+ } else {
+ longestMatchToken = new CompoundToken(i, j);
+ }
+ } else {
+ tokens.add(new CompoundToken(i, j));
+ }
}
+ }
+ if (this.onlyLongestMatch && longestMatchToken != null) {
+ tokens.add(longestMatchToken);
+ }
}
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
index 347a5ba..6981973 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java
@@ -16,18 +16,17 @@
*/
package org.apache.lucene.analysis.compound;
-
import java.io.IOException;
import java.util.Map;
-
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.TokenFilterFactory;
-/**
+/**
* Factory for {@link DictionaryCompoundWordTokenFilter}.
+ *
* <pre class="prettyprint">
* <fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -40,7 +39,8 @@ import org.apache.lucene.analysis.TokenFilterFactory;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
-public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
+ implements ResourceLoaderAware {
/** SPI name */
public static final String NAME = "dictionaryCompoundWord";
@@ -57,14 +57,16 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
super(args);
dictFile = require(args, "dictionary");
minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
- minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
- maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+ minSubwordSize =
+ getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+ maxSubwordSize =
+ getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public DictionaryCompoundWordTokenFilterFactory() {
throw defaultCtorException();
@@ -74,14 +76,14 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
public void inform(ResourceLoader loader) throws IOException {
dictionary = super.getWordSet(loader, dictFile, false);
}
-
+
@Override
public TokenStream create(TokenStream input) {
// if the dictionary is null, it means it was empty
if (dictionary == null) {
return input;
}
- return new DictionaryCompoundWordTokenFilter(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+ return new DictionaryCompoundWordTokenFilter(
+ input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
}
-
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
index 41f92c9..3acb5c2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
@@ -17,7 +17,6 @@
package org.apache.lucene.analysis.compound;
import java.io.IOException;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
@@ -25,83 +24,91 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.xml.sax.InputSource;
/**
- * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many Germanic languages.
+ * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many
+ * Germanic languages.
*
- * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
- * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
- * grammar and a word dictionary to achieve this.
+ * <p>"Donaudampfschiff" becomes Donau, dampf, schiff so that you can find "Donaudampfschiff" even
+ * when you only enter "schiff". It uses a hyphenation grammar and a word dictionary to achieve
+ * this.
*/
-public class HyphenationCompoundWordTokenFilter extends
- CompoundWordTokenFilterBase {
+public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
private HyphenationTree hyphenator;
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
- * @param input
- * the {@link org.apache.lucene.analysis.TokenStream} to process
- * @param hyphenator
- * the hyphenation pattern tree to use for hyphenation
- * @param dictionary
- * the word dictionary to match against.
+ * @param input the {@link org.apache.lucene.analysis.TokenStream} to process
+ * @param hyphenator the hyphenation pattern tree to use for hyphenation
+ * @param dictionary the word dictionary to match against.
*/
- public HyphenationCompoundWordTokenFilter(TokenStream input,
- HyphenationTree hyphenator, CharArraySet dictionary) {
- this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
- DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+ public HyphenationCompoundWordTokenFilter(
+ TokenStream input, HyphenationTree hyphenator, CharArraySet dictionary) {
+ this(
+ input,
+ hyphenator,
+ dictionary,
+ DEFAULT_MIN_WORD_SIZE,
+ DEFAULT_MIN_SUBWORD_SIZE,
+ DEFAULT_MAX_SUBWORD_SIZE,
+ false);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
- * @param input
- * the {@link org.apache.lucene.analysis.TokenStream} to process
- * @param hyphenator
- * the hyphenation pattern tree to use for hyphenation
- * @param dictionary
- * the word dictionary to match against.
- * @param minWordSize
- * only words longer than this get processed
- * @param minSubwordSize
- * only subwords longer than this get to the output stream
- * @param maxSubwordSize
- * only subwords shorter than this get to the output stream
- * @param onlyLongestMatch
- * Add only the longest matching subword to the stream
+ * @param input the {@link org.apache.lucene.analysis.TokenStream} to process
+ * @param hyphenator the hyphenation pattern tree to use for hyphenation
+ * @param dictionary the word dictionary to match against.
+ * @param minWordSize only words longer than this get processed
+ * @param minSubwordSize only subwords longer than this get to the output stream
+ * @param maxSubwordSize only subwords shorter than this get to the output stream
+ * @param onlyLongestMatch Add only the longest matching subword to the stream
*/
- public HyphenationCompoundWordTokenFilter(TokenStream input,
- HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
- int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
- onlyLongestMatch);
+ public HyphenationCompoundWordTokenFilter(
+ TokenStream input,
+ HyphenationTree hyphenator,
+ CharArraySet dictionary,
+ int minWordSize,
+ int minSubwordSize,
+ int maxSubwordSize,
+ boolean onlyLongestMatch) {
+ super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
this.hyphenator = hyphenator;
}
/**
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
- * <p>
- * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.CharArraySet, int, int, int, boolean)
- * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
- * null, minWordSize, minSubwordSize, maxSubwordSize }
+ *
+ * <p>Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream,
+ * org.apache.lucene.analysis.compound.hyphenation.HyphenationTree,
+ * org.apache.lucene.analysis.CharArraySet, int, int, int, boolean)
+ * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, null, minWordSize,
+ * minSubwordSize, maxSubwordSize }
*/
- public HyphenationCompoundWordTokenFilter(TokenStream input,
- HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
- int maxSubwordSize) {
- this(input, hyphenator, null, minWordSize, minSubwordSize,
- maxSubwordSize, false);
+ public HyphenationCompoundWordTokenFilter(
+ TokenStream input,
+ HyphenationTree hyphenator,
+ int minWordSize,
+ int minSubwordSize,
+ int maxSubwordSize) {
+ this(input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false);
}
/**
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
- * <p>
- * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, int, int, int)
- * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
- * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
+ *
+ * <p>Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream,
+ * org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, int, int, int)
+ * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE,
+ * DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
*/
- public HyphenationCompoundWordTokenFilter(TokenStream input,
- HyphenationTree hyphenator) {
- this(input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
+ public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator) {
+ this(
+ input,
+ hyphenator,
+ DEFAULT_MIN_WORD_SIZE,
+ DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE);
}
@@ -112,8 +119,7 @@ public class HyphenationCompoundWordTokenFilter extends
* @return An object representing the hyphenation patterns
* @throws java.io.IOException If there is a low-level I/O error.
*/
- public static HyphenationTree getHyphenationTree(String hyphenationFilename)
- throws IOException {
+ public static HyphenationTree getHyphenationTree(String hyphenationFilename) throws IOException {
return getHyphenationTree(new InputSource(hyphenationFilename));
}
@@ -158,7 +164,7 @@ public class HyphenationCompoundWordTokenFilter extends
// we only put subwords to the token stream
// that are longer than minPartSize
if (partLength < this.minSubwordSize) {
- // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
+ // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
// calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
continue;
}
@@ -194,7 +200,7 @@ public class HyphenationCompoundWordTokenFilter extends
}
}
}
- if (this.onlyLongestMatch && longestMatchToken!=null) {
+ if (this.onlyLongestMatch && longestMatchToken != null) {
tokens.add(longestMatchToken);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
index 2d70346..a1f8809 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java
@@ -16,37 +16,39 @@
*/
package org.apache.lucene.analysis.compound;
-
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
-
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.TokenFilterFactory;
-import org.apache.lucene.util.IOUtils;
import org.xml.sax.InputSource;
/**
* Factory for {@link HyphenationCompoundWordTokenFilter}.
- * <p>
- * This factory accepts the following parameters:
+ *
+ * <p>This factory accepts the following parameters:
+ *
* <ul>
- * <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern.
- * See <a href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
- * <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
- * <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
- * <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to 5.
- * <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
- * <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
- * <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword
- * to the stream. defaults to false.
+ * <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern. See <a
+ * href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
+ * <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
+ * <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
+ * <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to
+ * 5.
+ * <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
+ * <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
+ * <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword
+ * to the stream. defaults to false.
* </ul>
+ *
* <br>
+ *
* <pre class="prettyprint">
* <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
@@ -60,7 +62,8 @@ import org.xml.sax.InputSource;
* @since 3.1.0
* @lucene.spi {@value #NAME}
*/
-public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactory
+ implements ResourceLoaderAware {
/** SPI name */
public static final String NAME = "hyphenationCompoundWord";
@@ -74,7 +77,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactor
private final int minSubwordSize;
private final int maxSubwordSize;
private final boolean onlyLongestMatch;
-
+
/** Creates a new HyphenationCompoundWordTokenFilterFactory */
public HyphenationCompoundWordTokenFilterFactory(Map<String, String> args) {
super(args);
@@ -82,14 +85,16 @@ public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactor
encoding = get(args, "encoding");
hypFile = require(args, "hyphenator");
minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
- minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
- maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+ minSubwordSize =
+ getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+ maxSubwordSize =
+ getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = getBoolean(args, "onlyLongestMatch", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
-
+
/** Default ctor for compatibility with SPI */
public HyphenationCompoundWordTokenFilterFactory() {
throw defaultCtorException();
@@ -100,7 +105,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactor
InputStream stream = null;
try {
if (dictFile != null) // the dictionary can be empty.
- dictionary = getWordSet(loader, dictFile, false);
+ dictionary = getWordSet(loader, dictFile, false);
// TODO: Broken, because we cannot resolve real system id
// ResourceLoader should also supply method like ClassLoader to get resource URL
stream = loader.openResource(hypFile);
@@ -112,9 +117,16 @@ public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactor
IOUtils.closeWhileHandlingException(stream);
}
}
-
+
@Override
public TokenFilter create(TokenStream input) {
- return new HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+ return new HyphenationCompoundWordTokenFilter(
+ input,
+ hyphenator,
+ dictionary,
+ minWordSize,
+ minSubwordSize,
+ maxSubwordSize,
+ onlyLongestMatch);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java
index 8e83893..6531bac 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java
@@ -18,27 +18,21 @@
package org.apache.lucene.analysis.compound.hyphenation;
/**
- * This class implements a simple byte vector with access to the underlying
- * array.
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ * This class implements a simple byte vector with access to the underlying array. This class has
+ * been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been
+ * slightly modified.
*/
public class ByteVector {
- /**
- * Capacity increment size
- */
+ /** Capacity increment size */
private static final int DEFAULT_BLOCK_SIZE = 2048;
private int blockSize;
- /**
- * The encapsulated array
- */
+ /** The encapsulated array */
private byte[] array;
- /**
- * Points to next free item
- */
+ /** Points to next free item */
private int n;
public ByteVector() {
@@ -75,16 +69,12 @@ public class ByteVector {
return array;
}
- /**
- * return number of items in array
- */
+ /** return number of items in array */
public int length() {
return n;
}
- /**
- * returns current capacity of array
- */
+ /** returns current capacity of array */
public int capacity() {
return array.length;
}
@@ -97,9 +87,7 @@ public class ByteVector {
return array[index];
}
- /**
- * This is to implement memory allocation in the array. Like malloc().
- */
+ /** This is to implement memory allocation in the array. Like malloc(). */
public int alloc(int size) {
int index = n;
int len = array.length;
@@ -119,5 +107,4 @@ public class ByteVector {
array = aux;
}
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java
index c61ac19..1161b8b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java
@@ -17,28 +17,22 @@
package org.apache.lucene.analysis.compound.hyphenation;
/**
- * This class implements a simple char vector with access to the underlying
- * array.
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ * This class implements a simple char vector with access to the underlying array.
+ *
+ * <p>This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/).
+ * They have been slightly modified.
*/
public class CharVector implements Cloneable {
- /**
- * Capacity increment size
- */
+ /** Capacity increment size */
private static final int DEFAULT_BLOCK_SIZE = 2048;
private int blockSize;
- /**
- * The encapsulated array
- */
+ /** The encapsulated array */
private char[] array;
- /**
- * Points to next free item
- */
+ /** Points to next free item */
private int n;
public CharVector() {
@@ -71,9 +65,7 @@ public class CharVector implements Cloneable {
n = a.length;
}
- /**
- * Reset Vector but don't resize or clear elements
- */
+ /** Reset Vector but don't resize or clear elements */
public void clear() {
n = 0;
}
@@ -89,16 +81,12 @@ public class CharVector implements Cloneable {
return array;
}
- /**
- * return number of items in array
- */
+ /** return number of items in array */
public int length() {
return n;
}
- /**
- * returns current capacity of array
- */
+ /** returns current capacity of array */
public int capacity() {
return array.length;
}
@@ -130,5 +118,4 @@ public class CharVector implements Cloneable {
array = aux;
}
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java
index 1845f25..db92ce2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java
@@ -17,18 +17,16 @@
package org.apache.lucene.analysis.compound.hyphenation;
/**
- * This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
- * pre-break text, post-break text and no-break. If no line-break is generated
- * at this position, the no-break text is used, otherwise, pre-break and
- * post-break are used. Typically, pre-break is equal to the hyphen character
- * and the others are empty. However, this general scheme allows support for
- * cases in some languages where words change spelling if they're split across
- * lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
- * from TeX.
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ * This class represents a hyphen. A 'full' hyphen is made of 3 parts: the pre-break text,
+ * post-break text and no-break. If no line-break is generated at this position, the no-break text
+ * is used, otherwise, pre-break and post-break are used. Typically, pre-break is equal to the
+ * hyphen character and the others are empty. However, this general scheme allows support for cases
+ * in some languages where words change spelling if they're split across lines, like German's
+ * 'backen' which hyphenates 'bak-ken'. BTW, this comes from TeX.
+ *
+ * <p>This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/).
+ * They have been slightly modified.
*/
-
public class Hyphen {
public String preBreak;
@@ -50,8 +48,7 @@ public class Hyphen {
@Override
public String toString() {
- if (noBreak == null && postBreak == null && preBreak != null
- && preBreak.equals("-")) {
+ if (noBreak == null && postBreak == null && preBreak != null && preBreak.equals("-")) {
return "-";
}
StringBuilder res = new StringBuilder("{");
@@ -63,5 +60,4 @@ public class Hyphen {
res.append('}');
return res.toString();
}
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java
index 3fb1e04..6b28ea5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java
@@ -18,30 +18,25 @@ package org.apache.lucene.analysis.compound.hyphenation;
/**
* This class represents a hyphenated word.
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ *
+ * <p>This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/).
+ * They have been slightly modified.
*/
public class Hyphenation {
private int[] hyphenPoints;
- /**
- * rawWord as made of alternating strings and {@link Hyphen Hyphen} instances
- */
+ /** rawWord as made of alternating strings and {@link Hyphen Hyphen} instances */
Hyphenation(int[] points) {
hyphenPoints = points;
}
- /**
- * @return the number of hyphenation points in the word
- */
+ /** @return the number of hyphenation points in the word */
public int length() {
return hyphenPoints.length;
}
- /**
- * @return the hyphenation points
- */
+ /** @return the hyphenation points */
public int[] getHyphenationPoints() {
return hyphenPoints;
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
index 70ad08e..acc0955 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
@@ -20,35 +20,27 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
-
import org.xml.sax.InputSource;
/**
- * This tree structure stores the hyphenation patterns in an efficient way for
- * fast lookup. It provides the provides the method to hyphenate a word.
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ * This tree structure stores the hyphenation patterns in an efficient way for fast lookup. It
+ * provides the method to hyphenate a word.
+ *
+ * <p>This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/).
+ * They have been slightly modified.
*/
public class HyphenationTree extends TernaryTree implements PatternConsumer {
- /**
- * value space: stores the interletter values
- */
+ /** value space: stores the interletter values */
protected ByteVector vspace;
- /**
- * This map stores hyphenation exceptions
- */
- protected HashMap<String,ArrayList<Object>> stoplist;
+ /** This map stores hyphenation exceptions */
+ protected HashMap<String, ArrayList<Object>> stoplist;
- /**
- * This map stores the character classes
- */
+ /** This map stores the character classes */
protected TernaryTree classmap;
- /**
- * Temporary map to store interletter values on pattern loading.
- */
+ /** Temporary map to store interletter values on pattern loading. */
private transient TernaryTree ivalues;
public HyphenationTree() {
@@ -59,12 +51,10 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
}
/**
- * Packs the values by storing them in 4 bits, two values into a byte Values
- * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
- * value.
- *
- * @param values a string of digits from '0' to '9' representing the
- * interletter values.
+ * Packs the values by storing them in 4 bits, two values into a byte. Values range is from 0 to 9.
+ * We use zero as terminator, so we'll add 1 to the value.
+ *
+ * @param values a string of digits from '0' to '9' representing the interletter values.
* @return the index into the vspace array where the packed values are stored.
*/
protected int packValues(String values) {
@@ -89,7 +79,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
StringBuilder buf = new StringBuilder();
byte v = vspace.get(k++);
while (v != 0) {
- char c = (char) (((v & 0xf0 )>>> 4) - 1 + '0');
+ char c = (char) (((v & 0xf0) >>> 4) - 1 + '0');
buf.append(c);
c = (char) (v & 0x0f);
if (c == 0) {
@@ -104,7 +94,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
/**
* Read hyphenation patterns from an XML file.
- *
+ *
* @param source the InputSource for the file
* @throws IOException In case the parsing fails
*/
@@ -132,9 +122,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
return "";
}
- /**
- * String compare, returns 0 if equal or t is a substring of s
- */
+ /** String compare, returns 0 if equal or t is a substring of s */
protected int hstrcmp(char[] s, int si, char[] t, int ti) {
for (; s[si] == t[ti]; si++, ti++) {
if (s[si] == 0) {
@@ -151,7 +139,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
StringBuilder buf = new StringBuilder();
byte v = vspace.get(k++);
while (v != 0) {
- char c = (char) (((v & 0xf0 )>>> 4) - 1);
+ char c = (char) (((v & 0xf0) >>> 4) - 1);
buf.append(c);
c = (char) (v & 0x0f);
if (c == 0) {
@@ -169,27 +157,21 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
}
/**
- * <p>
- * Search for all possible partial matches of word starting at index an update
- * interletter values. In other words, it does something like:
- * </p>
- * <code>
+ * Search for all possible partial matches of word starting at index and update interletter values.
+ * In other words, it does something like: <code>
* for(i=0; i<patterns.length; i++) {
* if ( word.substring(index).startsWidth(patterns[i]) )
* update_interletter_values(patterns[i]);
* }
* </code>
- * <p>
- * But it is done in an efficient way since the patterns are stored in a
- * ternary tree. In fact, this is the whole purpose of having the tree: doing
- * this search without having to test every single pattern. The number of
- * patterns for languages such as English range from 4000 to 10000. Thus,
- * doing thousands of string comparisons for each word to hyphenate would be
- * really slow without the tree. The tradeoff is memory, but using a ternary
- * tree instead of a trie, almost halves the memory used by Lout or TeX.
- * It's also faster than using a hash table
- * </p>
- *
+ *
+ * <p>But it is done in an efficient way since the patterns are stored in a ternary tree. In fact,
+ * this is the whole purpose of having the tree: doing this search without having to test every
+ * single pattern. The number of patterns for languages such as English range from 4000 to 10000.
+ * Thus, doing thousands of string comparisons for each word to hyphenate would be really slow
+ * without the tree. The tradeoff is memory, but using a ternary tree instead of a trie, almost
+ * halves the memory used by Lout or TeX. It's also faster than using a hash table
+ *
* @param word null terminated word to match
* @param index start index from word
* @param il interletter values array to update
@@ -244,8 +226,8 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
q = lo[q];
/**
- * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
- * java chars are unsigned
+ * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but java chars are
+ * unsigned
*/
}
}
@@ -257,50 +239,42 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
/**
* Hyphenate word and return a Hyphenation object.
- *
+ *
* @param word the word to be hyphenated
- * @param remainCharCount Minimum number of characters allowed before the
- * hyphenation point.
- * @param pushCharCount Minimum number of characters allowed after the
- * hyphenation point.
- * @return a {@link Hyphenation Hyphenation} object representing the
- * hyphenated word or null if word is not hyphenated.
+ * @param remainCharCount Minimum number of characters allowed before the hyphenation point.
+ * @param pushCharCount Minimum number of characters allowed after the hyphenation point.
+ * @return a {@link Hyphenation Hyphenation} object representing the hyphenated word or null if
+ * word is not hyphenated.
*/
- public Hyphenation hyphenate(String word, int remainCharCount,
- int pushCharCount) {
+ public Hyphenation hyphenate(String word, int remainCharCount, int pushCharCount) {
char[] w = word.toCharArray();
return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
}
/**
- * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
- * may be absent, the first n is at offset, the first l is at offset +
- * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
- * into word. In the first part of the routine len = w.length, in the second
- * part of the routine len = word.length. Three indices are used: index(w),
- * the index in w, index(word), the index in word, letterindex(word), the
- * index in the letter part of word. The following relations exist: index(w) =
- * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
- * index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
- * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
- * iIgnoreAtBeginning
+ * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n may be absent, the
+ * first n is at offset, the first l is at offset + iIgnoreAtBeginning; word = ".llllll.'\0'***",
+ * where all l in w are copied into word. In the first part of the routine len = w.length, in the
+ * second part of the routine len = word.length. Three indices are used: index(w), the index in w,
+ * index(word), the index in word, letterindex(word), the index in the letter part of word. The
+ * following relations exist: index(w) = offset + i - 1 index(word) = i - iIgnoreAtBeginning
+ * letterindex(word) = index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
+ * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + iIgnoreAtBeginning
*/
/**
* Hyphenate word and return an array of hyphenation points.
- *
+ *
* @param w char array that contains the word
* @param offset Offset to first character in word
* @param len Length of word
- * @param remainCharCount Minimum number of characters allowed before the
- * hyphenation point.
- * @param pushCharCount Minimum number of characters allowed after the
- * hyphenation point.
- * @return a {@link Hyphenation Hyphenation} object representing the
- * hyphenated word or null if word is not hyphenated.
+ * @param remainCharCount Minimum number of characters allowed before the hyphenation point.
+ * @param pushCharCount Minimum number of characters allowed after the hyphenation point.
+ * @return a {@link Hyphenation Hyphenation} object representing the hyphenated word or null if
+ * word is not hyphenated.
*/
- public Hyphenation hyphenate(char[] w, int offset, int len,
- int remainCharCount, int pushCharCount) {
+ public Hyphenation hyphenate(
+ char[] w, int offset, int len, int remainCharCount, int pushCharCount) {
int i;
char[] word = new char[len + 3];
@@ -370,8 +344,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
// i + 1 is index(word),
// result[k] = corresponding index(w)
for (i = 0; i < len; i++) {
- if (((il[i + 1] & 1) == 1) && i >= remainCharCount
- && i <= (len - pushCharCount)) {
+ if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount)) {
result[k++] = i + iIgnoreAtBeginning;
}
}
@@ -379,12 +352,12 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
if (k > 0) {
// trim result array
- int[] res = new int[k+2];
+ int[] res = new int[k + 2];
System.arraycopy(result, 0, res, 1, k);
// We add the synthetical hyphenation points
// at the beginning and end of the word
- res[0]=0;
- res[k+1]=len;
+ res[0] = 0;
+ res[k + 1] = len;
return new Hyphenation(res);
} else {
return null;
@@ -392,14 +365,13 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
}
/**
- * Add a character class to the tree. It is used by
- * {@link PatternParser PatternParser} as callback to add character classes.
- * Character classes define the valid word characters for hyphenation. If a
- * word contains a character not defined in any of the classes, it is not
- * hyphenated. It also defines a way to normalize the characters in order to
- * compare them with the stored patterns. Usually pattern files use only lower
- * case characters, in this case a class for letter 'a', for example, should
- * be defined as "aA", the first character being the normalization char.
+ * Add a character class to the tree. It is used by {@link PatternParser PatternParser} as
+ * callback to add character classes. Character classes define the valid word characters for
+ * hyphenation. If a word contains a character not defined in any of the classes, it is not
+ * hyphenated. It also defines a way to normalize the characters in order to compare them with the
+ * stored patterns. Usually pattern files use only lower case characters, in this case a class for
+ * letter 'a', for example, should be defined as "aA", the first character being the normalization
+ * char.
*/
@Override
public void addClass(String chargroup) {
@@ -415,13 +387,11 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
}
/**
- * Add an exception to the tree. It is used by
- * {@link PatternParser PatternParser} class as callback to store the
- * hyphenation exceptions.
- *
+ * Add an exception to the tree. It is used by {@link PatternParser PatternParser} class as
+ * callback to store the hyphenation exceptions.
+ *
* @param word normalized word
- * @param hyphenatedword a vector of alternating strings and
- * {@link Hyphen hyphen} objects.
+ * @param hyphenatedword a vector of alternating strings and {@link Hyphen hyphen} objects.
*/
@Override
public void addException(String word, ArrayList<Object> hyphenatedword) {
@@ -429,14 +399,13 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
}
/**
- * Add a pattern to the tree. Mainly, to be used by
- * {@link PatternParser PatternParser} class as callback to add a pattern to
- * the tree.
- *
+ * Add a pattern to the tree. Mainly, to be used by {@link PatternParser PatternParser} class as
+ * callback to add a pattern to the tree.
+ *
* @param pattern the hyphenation pattern
- * @param ivalue interletter weight values indicating the desirability and
- * priority of hyphenating at a given point within the pattern. It
- * should contain only digit characters. (i.e. '0' to '9').
+ * @param ivalue interletter weight values indicating the desirability and priority of hyphenating
+ * at a given point within the pattern. It should contain only digit characters. (i.e. '0' to
+ * '9').
*/
@Override
public void addPattern(String pattern, String ivalue) {
@@ -450,9 +419,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
@Override
public void printStats(PrintStream out) {
- out.println("Value space size = "
- + Integer.toString(vspace.length()));
+ out.println("Value space size = " + Integer.toString(vspace.length()));
super.printStats(out);
-
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java
index 358841c..abe5763 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java
@@ -19,36 +19,33 @@ package org.apache.lucene.analysis.compound.hyphenation;
import java.util.ArrayList;
/**
- * This interface is used to connect the XML pattern file parser to the
- * hyphenation tree.
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ * This interface is used to connect the XML pattern file parser to the hyphenation tree.
+ *
+ * <p>This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/).
+ * They have been slightly modified.
*/
public interface PatternConsumer {
/**
- * Add a character class. A character class defines characters that are
- * considered equivalent for the purpose of hyphenation (e.g. "aA"). It
- * usually means to ignore case.
- *
+ * Add a character class. A character class defines characters that are considered equivalent for
+ * the purpose of hyphenation (e.g. "aA"). It usually means to ignore case.
+ *
* @param chargroup character group
*/
void addClass(String chargroup);
/**
- * Add a hyphenation exception. An exception replaces the result obtained by
- * the algorithm for cases for which this fails or the user wants to provide
- * his own hyphenation. A hyphenatedword is a vector of alternating String's
- * and {@link Hyphen Hyphen} instances
+ * Add a hyphenation exception. An exception replaces the result obtained by the algorithm for
+ * cases for which this fails or the user wants to provide his own hyphenation. A hyphenatedword
+ * is a vector of alternating String's and {@link Hyphen Hyphen} instances
*/
void addException(String word, ArrayList<Object> hyphenatedword);
/**
* Add hyphenation patterns.
- *
+ *
* @param pattern the pattern
* @param values interletter values expressed as a string of digit characters.
*/
void addPattern(String pattern, String values);
-
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
index f5f31d8..33a762b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
@@ -16,25 +16,21 @@
*/
package org.apache.lucene.analysis.compound.hyphenation;
-// SAX
-import org.xml.sax.XMLReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import javax.xml.parsers.SAXParserFactory;
+import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
+import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.Attributes;
-
-// Java
-import java.io.IOException;
-import java.util.ArrayList;
-
-import javax.xml.parsers.SAXParserFactory;
/**
- * A SAX document handler to read and parse hyphenation patterns from a XML
- * file.
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ * A SAX document handler to read and parse hyphenation patterns from an XML file.
+ *
+ * <p>This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/).
+ * They have been slightly modified.
*/
public class PatternParser extends DefaultHandler {
@@ -67,7 +63,6 @@ public class PatternParser extends DefaultHandler {
parser.setErrorHandler(this);
parser.setEntityResolver(this);
hyphenChar = '-'; // default
-
}
public PatternParser(PatternConsumer consumer) {
@@ -81,7 +76,7 @@ public class PatternParser extends DefaultHandler {
/**
* Parses a hyphenation pattern file.
- *
+ *
* @param filename the filename
* @throws IOException In case of an exception while parsing
*/
@@ -91,7 +86,7 @@ public class PatternParser extends DefaultHandler {
/**
* Parses a hyphenation pattern file.
- *
+ *
* @param source the InputSource for the file
* @throws IOException In case of an exception while parsing
*/
@@ -105,7 +100,7 @@ public class PatternParser extends DefaultHandler {
/**
* Creates a SAX parser using JAXP
- *
+ *
* @return the created SAX parser
*/
static XMLReader createParser() {
@@ -242,10 +237,8 @@ public class PatternParser extends DefaultHandler {
@Override
public InputSource resolveEntity(String publicId, String systemId) {
// supply the internal hyphenation.dtd if possible
- if (
- (systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) ||
- ("hyphenation-info".equals(publicId))
- ) {
+ if ((systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*"))
+ || ("hyphenation-info".equals(publicId))) {
// System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm());
}
@@ -257,12 +250,11 @@ public class PatternParser extends DefaultHandler {
//
/**
- * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
- * java.lang.String, java.lang.String, org.xml.sax.Attributes)
+ * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String,
+ * java.lang.String, org.xml.sax.Attributes)
*/
@Override
- public void startElement(String uri, String local, String raw,
- Attributes attrs) {
+ public void startElement(String uri, String local, String raw, Attributes attrs) {
if (local.equals("hyphen-char")) {
String h = attrs.getValue("value");
if (h != null && h.length() == 1) {
@@ -279,16 +271,16 @@ public class PatternParser extends DefaultHandler {
if (token.length() > 0) {
exception.add(token.toString());
}
- exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"),
- attrs.getValue("post")));
+ exception.add(
+ new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), attrs.getValue("post")));
currElement = ELEM_HYPHEN;
}
token.setLength(0);
}
/**
- * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
- * java.lang.String, java.lang.String)
+ * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String,
+ * java.lang.String)
*/
@Override
@SuppressWarnings({"unchecked", "rawtypes"})
@@ -303,8 +295,7 @@ public class PatternParser extends DefaultHandler {
case ELEM_EXCEPTIONS:
exception.add(word);
exception = normalizeException(exception);
- consumer.addException(getExceptionWord(exception),
- (ArrayList) exception.clone());
+ consumer.addException(getExceptionWord(exception), (ArrayList) exception.clone());
break;
case ELEM_PATTERNS:
consumer.addPattern(getPattern(word), getInterletterValues(word));
@@ -322,12 +313,9 @@ public class PatternParser extends DefaultHandler {
} else {
currElement = 0;
}
-
}
- /**
- * @see org.xml.sax.ContentHandler#characters(char[], int, int)
- */
+ /** @see org.xml.sax.ContentHandler#characters(char[], int, int) */
@SuppressWarnings({"unchecked", "rawtypes"})
@Override
public void characters(char ch[], int start, int length) {
@@ -343,8 +331,7 @@ public class PatternParser extends DefaultHandler {
case ELEM_EXCEPTIONS:
exception.add(word);
exception = normalizeException(exception);
- consumer.addException(getExceptionWord(exception),
- (ArrayList) exception.clone());
+ consumer.addException(getExceptionWord(exception), (ArrayList) exception.clone());
exception.clear();
break;
case ELEM_PATTERNS:
@@ -353,12 +340,9 @@ public class PatternParser extends DefaultHandler {
}
word = readToken(chars);
}
-
}
- /**
- * Returns a string of the location.
- */
+ /** Returns a string of the location. */
private String getLocationString(SAXParseException ex) {
StringBuilder str = new StringBuilder();
@@ -376,6 +360,5 @@ public class PatternParser extends DefaultHandler {
str.append(ex.getColumnNumber());
return str.toString();
-
} // getLocationString(SAXParseException):String
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java
index 6d1d71a..15e9ed8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java
@@ -21,92 +21,71 @@ import java.util.Enumeration;
import java.util.Stack;
/**
+ *
+ *
* <h2>Ternary Search Tree.</h2>
- *
- * <p>
- * A ternary search tree is a hybrid between a binary tree and a digital search
- * tree (trie). Keys are limited to strings. A data value of type char is stored
- * in each leaf node. It can be used as an index (or pointer) to the data.
- * Branches that only contain one key are compressed to one node by storing a
- * pointer to the trailer substring of the key. This class is intended to serve
- * as base class or helper class to implement Dictionary collections or the
- * like. Ternary trees have some nice properties as the following: the tree can
- * be traversed in sorted order, partial matches (wildcard) can be implemented,
- * retrieval of all keys within a given distance from the target, etc. The
- * storage requirements are higher than a binary tree but a lot less than a
- * trie. Performance is comparable with a hash table, sometimes it outperforms a
- * hash function (most of the time can determine a miss faster than a hash).
- * </p>
- *
- * <p>
- * The main purpose of this java port is to serve as a base for implementing
- * TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
- * requires from 5000 to 15000 hyphenation patterns which will be keys in this
- * tree. The strings patterns are usually small (from 2 to 5 characters), but
- * each char in the tree is stored in a node. Thus memory usage is the main
- * concern. We will sacrifice 'elegance' to keep memory requirements to the
- * minimum. Using java's char type as pointer (yes, I know pointer it is a
- * forbidden word in java) we can keep the size of the node to be just 8 bytes
- * (3 pointers and the data char). This gives room for about 65000 nodes. In my
- * tests the english patterns took 7694 nodes and the german patterns 10055
- * nodes, so I think we are safe.
- * </p>
- *
- * <p>
- * All said, this is a map with strings as keys and char as value. Pretty
- * limited!. It can be extended to a general map by using the string
- * representation of an object and using the char value as an index to an array
- * that contains the object values.
- * </p>
- *
- * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+ *
+ * <p>A ternary search tree is a hybrid between a binary tree and a digital search tree (trie). Keys
+ * are limited to strings. A data value of type char is stored in each leaf node. It can be used as
+ * an index (or pointer) to the data. Branches that only contain one key are compressed to one node
+ * by storing a pointer to the trailer substring of the key. This class is intended to serve as base
+ * class or helper class to implement Dictionary collections or the like. Ternary trees have some
+ * nice properties as the following: the tree can be traversed in sorted order, partial matches
+ * (wildcard) can be implemented, retrieval of all keys within a given distance from the target,
+ * etc. The storage requirements are higher than a binary tree but a lot less than a trie.
+ * Performance is comparable with a hash table, sometimes it outperforms a hash function (most of
+ * the time can determine a miss faster than a hash).
+ *
+ * <p>The main purpose of this java port is to serve as a base for implementing TeX's hyphenation
+ * algorithm (see The TeXBook, appendix H). Each language requires from 5000 to 15000 hyphenation
+ * patterns which will be keys in this tree. The string patterns are usually small (from 2 to 5
+ * characters), but each char in the tree is stored in a node. Thus memory usage is the main
+ * concern. We will sacrifice 'elegance' to keep memory requirements to the minimum. Using java's
+ * char type as pointer (yes, I know pointer is a forbidden word in Java) we can keep the size of
+ * the node to be just 8 bytes (3 pointers and the data char). This gives room for about 65000
+ * nodes. In my tests the English patterns took 7694 nodes and the German patterns 10055 nodes, so I
+ * think we are safe.
+ *
+ * <p>All said, this is a map with strings as keys and char as value. Pretty limited!. It can be
+ * extended to a general map by using the string representation of an object and using the char
+ * value as an index to an array that contains the object values. This class has been taken from the
+ * Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
-
public class TernaryTree implements Cloneable {
/**
- * We use 4 arrays to represent a node. I guess I should have created a proper
- * node class, but somehow Knuth's pascal code made me forget we now have a
- * portable language with virtual memory management and automatic garbage
- * collection! And now is kind of late, furthermore, if it ain't broken, don't
- * fix it.
+ * We use 4 arrays to represent a node. I guess I should have created a proper node class, but
+ * somehow Knuth's pascal code made me forget we now have a portable language with virtual memory
+ * management and automatic garbage collection! And now is kind of late, furthermore, if it ain't
+ * broken, don't fix it.
*/
/**
- * Pointer to low branch and to rest of the key when it is stored directly in
- * this node, we don't have unions in java!
+ * Pointer to the low branch and to the rest of the key when it is stored directly in this node; we
+ * don't have unions in Java!
*/
protected char[] lo;
- /**
- * Pointer to high branch.
- */
+ /** Pointer to high branch. */
protected char[] hi;
- /**
- * Pointer to equal branch and to data when this node is a string terminator.
- */
+ /** Pointer to equal branch and to data when this node is a string terminator. */
protected char[] eq;
/**
- * <P>
- * The character stored in this node: splitchar. Two special values are
- * reserved:
- * </P>
+ * The character stored in this node: splitchar. Two special values are reserved:
+ *
* <ul>
- * <li>0x0000 as string terminator</li>
- * <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
+ * <li>0x0000 as string terminator
+ * <li>0xFFFF to indicate that the branch starting at this node is compressed
* </ul>
- * <p>
- * This shouldn't be a problem if we give the usual semantics to strings since
- * 0xFFFF is guaranteed not to be an Unicode character.
- * </p>
+ *
+ * <p>This shouldn't be a problem if we give the usual semantics to strings since 0xFFFF is
+ * guaranteed not to be a Unicode character.
*/
protected char[] sc;
- /**
- * This vector holds the trailing of the keys when the branch is compressed.
- */
+ /** This vector holds the trailing of the keys when the branch is compressed. */
protected CharVector kv;
protected char root;
@@ -133,10 +112,9 @@ public class TernaryTree implements Cloneable {
}
/**
- * Branches are initially compressed, needing one node per key plus the size
- * of the string key. They are decompressed as needed when another key with
- * same prefix is inserted. This saves a lot of space, specially for long
- * keys.
+ * Branches are initially compressed, needing one node per key plus the size of the string key.
+ * They are decompressed as needed when another key with the same prefix is inserted. This saves a
+ * lot of space, especially for long keys.
*/
public void insert(String key, char val) {
// make sure we have enough room in the arrays
@@ -158,9 +136,7 @@ public class TernaryTree implements Cloneable {
root = insert(root, key, start, val);
}
- /**
- * The actual insertion function, recursive version.
- */
+ /** The actual insertion function, recursive version. */
private char insert(char p, char[] key, int start, char val) {
int len = strlen(key, start);
if (p == 0) {
@@ -230,9 +206,7 @@ public class TernaryTree implements Cloneable {
return p;
}
- /**
- * Compares 2 null terminated char arrays
- */
+ /** Compares 2 null terminated char arrays */
public static int strcmp(char[] a, int startA, char[] b, int startB) {
for (; a[startA] == b[startB]; startA++, startB++) {
if (a[startA] == 0) {
@@ -242,9 +216,7 @@ public class TernaryTree implements Cloneable {
return a[startA] - b[startB];
}
- /**
- * Compares a string with null terminated char array
- */
+ /** Compares a string with null terminated char array */
public static int strcmp(String str, char[] a, int start) {
int i, d, len = str.length();
for (i = 0; i < len; i++) {
@@ -260,7 +232,6 @@ public class TernaryTree implements Cloneable {
return -a[start + i];
}
return 0;
-
}
public static void strcpy(char[] dst, int di, char[] src, int si) {
@@ -363,9 +334,9 @@ public class TernaryTree implements Cloneable {
}
/**
- * Recursively insert the median first and then the median of the lower and
- * upper halves, and so on in order to get a balanced tree. The array of keys
- * is assumed to be sorted in ascending order.
+ * Recursively insert the median first and then the median of the lower and upper halves, and so
+ * on in order to get a balanced tree. The array of keys is assumed to be sorted in ascending
+ * order.
*/
protected void insertBalanced(String[] k, char[] v, int offset, int n) {
int m;
@@ -380,9 +351,7 @@ public class TernaryTree implements Cloneable {
insertBalanced(k, v, offset + m + 1, n - m - 1);
}
- /**
- * Balance the tree for best search performance
- */
+ /** Balance the tree for best search performance */
public void balance() {
// System.out.print("Before root splitchar = ");
// System.out.println(sc[root]);
@@ -404,15 +373,12 @@ public class TernaryTree implements Cloneable {
}
/**
- * Each node stores a character (splitchar) which is part of some key(s). In a
- * compressed branch (one that only contain a single string key) the trailer
- * of the key which is not already in nodes is stored externally in the kv
- * array. As items are inserted, key substrings decrease. Some substrings may
- * completely disappear when the whole branch is totally decompressed. The
- * tree is traversed to find the key substrings actually used. In addition,
- * duplicate substrings are removed using a map (implemented with a
- * TernaryTree!).
- *
+ * Each node stores a character (splitchar) which is part of some key(s). In a compressed branch
+ * (one that only contain a single string key) the trailer of the key which is not already in
+ * nodes is stored externally in the kv array. As items are inserted, key substrings decrease.
+ * Some substrings may completely disappear when the whole branch is totally decompressed. The
+ * tree is traversed to find the key substrings actually used. In addition, duplicate substrings
+ * are removed using a map (implemented with a TernaryTree!).
*/
public void trimToSize() {
// first balance the tree for best performance
@@ -458,18 +424,15 @@ public class TernaryTree implements Cloneable {
/**
* Enumeration over TST keys
+ *
* @lucene.internal
*/
public class Iterator implements Enumeration<String> {
- /**
- * current node index
- */
+ /** current node index */
int cur;
- /**
- * current key
- */
+ /** current key */
String curkey;
... 125364 lines suppressed ...