You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2012/08/13 15:53:27 UTC
svn commit: r1372423 [5/45] - in /lucene/dev/branches/LUCENE-2878: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/
dev-tools/maven/ dev-tools/maven/lucene/
dev-tools/maven/lucene/analysis/common/
dev-tools/maven/lucene/analysis/icu/ de...
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Mon Aug 13 13:52:46 2012
@@ -141,9 +141,9 @@ InlineElment = ( [aAbBiIqQsSuU]
[vV][aA][rR] )
-%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
+%include HTMLCharacterEntities.jflex
-%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
+%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java Mon Aug 13 13:52:46 2012
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
@@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
+ * By default, when a CJK character has no adjacent characters to form
+ * a bigram, it is output in unigram form. If you want to always output
+ * both unigrams and bigrams, set the <code>outputUnigrams</code>
+ * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
+ * This can be used for a combined unigram+bigram approach.
+ * <p>
* In all cases, all non-CJK input is passed thru unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
@@ -67,10 +75,16 @@ public final class CJKBigramFilter exten
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
+
+ // true if we should output unigram tokens always
+ private final boolean outputUnigrams;
+ private boolean ngramState; // false = output unigram, true = output bigram
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
@@ -88,23 +102,36 @@ public final class CJKBigramFilter exten
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
- * CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
+ * CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
- * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
+ * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
+ * CJKBigramFilter(in, flags, false)}
+ */
+ public CJKBigramFilter(TokenStream in, int flags) {
+ this(in, flags, false);
+ }
+
+ /**
+ * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+ * and whether or not unigrams should also be output.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
+ * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
+ * when this is false, this is only done when there are no adjacent characters to form
+ * a bigram.
*/
- public CJKBigramFilter(TokenStream in, int flags) {
+ public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
super(in);
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
+ this.outputUnigrams = outputUnigrams;
}
/*
@@ -120,7 +147,24 @@ public final class CJKBigramFilter exten
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
- flushBigram();
+ if (outputUnigrams) {
+
+ // when also outputting unigrams, we output the unigram first,
+ // then rewind back to revisit the bigram.
+ // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
+ // the logic in hasBufferedUnigram ensures we output the C,
+ // even though it did actually have adjacent CJK characters.
+
+ if (ngramState) {
+ flushBigram();
+ } else {
+ flushUnigram();
+ index--;
+ }
+ ngramState = !ngramState;
+ } else {
+ flushBigram();
+ }
return true;
} else if (doNext()) {
@@ -260,6 +304,11 @@ public final class CJKBigramFilter exten
termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE);
+ // when outputting unigrams, all bigrams are synonyms that span two unigrams
+ if (outputUnigrams) {
+ posIncAtt.setPositionIncrement(0);
+ posLengthAtt.setPositionLength(2);
+ }
index++;
}
@@ -292,7 +341,13 @@ public final class CJKBigramFilter exten
* inputs.
*/
private boolean hasBufferedUnigram() {
- return bufferLen == 1 && index == 0;
+ if (outputUnigrams) {
+ // when outputting unigrams always
+ return bufferLen - index == 1;
+ } else {
+ // otherwise it's only when we have a lone CJK character
+ return bufferLen == 1 && index == 0;
+ }
}
@Override
@@ -303,5 +358,6 @@ public final class CJKBigramFilter exten
lastEndOffset = 0;
loneState = null;
exhausted = false;
+ ngramState = false;
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java Mon Aug 13 13:52:46 2012
@@ -57,6 +57,9 @@ public class DictionaryCompoundWordToken
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) {
super(matchVersion, input, dictionary);
+ if (dictionary == null) {
+ throw new IllegalArgumentException("dictionary cannot be null");
+ }
}
/**
@@ -83,6 +86,9 @@ public class DictionaryCompoundWordToken
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+ if (dictionary == null) {
+ throw new IllegalArgumentException("dictionary cannot be null");
+ }
}
@Override
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Mon Aug 13 13:52:46 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.compo
*/
import java.io.File;
+import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -131,10 +132,10 @@ public class HyphenationCompoundWordToke
*
* @param hyphenationFilename the filename of the XML grammar to load
* @return An object representing the hyphenation patterns
- * @throws Exception
+ * @throws IOException
*/
public static HyphenationTree getHyphenationTree(String hyphenationFilename)
- throws Exception {
+ throws IOException {
return getHyphenationTree(new InputSource(hyphenationFilename));
}
@@ -143,10 +144,10 @@ public class HyphenationCompoundWordToke
*
* @param hyphenationFile the file of the XML grammar to load
* @return An object representing the hyphenation patterns
- * @throws Exception
+ * @throws IOException
*/
public static HyphenationTree getHyphenationTree(File hyphenationFile)
- throws Exception {
+ throws IOException {
return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
}
@@ -155,10 +156,10 @@ public class HyphenationCompoundWordToke
*
* @param hyphenationSource the InputSource pointing to the XML grammar
* @return An object representing the hyphenation patterns
- * @throws Exception
+ * @throws IOException
*/
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
- throws Exception {
+ throws IOException {
HyphenationTree tree = new HyphenationTree();
tree.loadPatterns(hyphenationSource);
return tree;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java Mon Aug 13 13:52:46 2012
@@ -18,8 +18,8 @@
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.File;
+import java.io.IOException;
import java.io.PrintStream;
-import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
@@ -108,25 +108,20 @@ public class HyphenationTree extends Ter
* Read hyphenation patterns from an XML file.
*
* @param f the filename
- * @throws HyphenationException In case the parsing fails
+ * @throws IOException In case the parsing fails
*/
- public void loadPatterns(File f) throws HyphenationException {
- try {
- InputSource src = new InputSource(f.toURL().toExternalForm());
- loadPatterns(src);
- } catch (MalformedURLException e) {
- throw new HyphenationException("Error converting the File '" + f
- + "' to a URL: " + e.getMessage());
- }
+ public void loadPatterns(File f) throws IOException {
+ InputSource src = new InputSource(f.toURL().toExternalForm());
+ loadPatterns(src);
}
/**
* Read hyphenation patterns from an XML file.
*
* @param source the InputSource for the file
- * @throws HyphenationException In case the parsing fails
+ * @throws IOException In case the parsing fails
*/
- public void loadPatterns(InputSource source) throws HyphenationException {
+ public void loadPatterns(InputSource source) throws IOException {
PatternParser pp = new PatternParser(this);
ivalues = new TernaryTree();
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java Mon Aug 13 13:52:46 2012
@@ -27,9 +27,7 @@ import org.xml.sax.Attributes;
// Java
import java.io.File;
-import java.io.FileNotFoundException;
import java.io.IOException;
-import java.net.MalformedURLException;
import java.util.ArrayList;
import javax.xml.parsers.SAXParserFactory;
@@ -87,9 +85,9 @@ public class PatternParser extends Defau
* Parses a hyphenation pattern file.
*
* @param filename the filename
- * @throws HyphenationException In case of an exception while parsing
+ * @throws IOException In case of an exception while parsing
*/
- public void parse(String filename) throws HyphenationException {
+ public void parse(String filename) throws IOException {
parse(new InputSource(filename));
}
@@ -97,33 +95,24 @@ public class PatternParser extends Defau
* Parses a hyphenation pattern file.
*
* @param file the pattern file
- * @throws HyphenationException In case of an exception while parsing
+ * @throws IOException In case of an exception while parsing
*/
- public void parse(File file) throws HyphenationException {
- try {
- InputSource src = new InputSource(file.toURL().toExternalForm());
- parse(src);
- } catch (MalformedURLException e) {
- throw new HyphenationException("Error converting the File '" + file
- + "' to a URL: " + e.getMessage());
- }
+ public void parse(File file) throws IOException {
+ InputSource src = new InputSource(file.toURL().toExternalForm());
+ parse(src);
}
/**
* Parses a hyphenation pattern file.
*
* @param source the InputSource for the file
- * @throws HyphenationException In case of an exception while parsing
+ * @throws IOException In case of an exception while parsing
*/
- public void parse(InputSource source) throws HyphenationException {
+ public void parse(InputSource source) throws IOException {
try {
parser.parse(source);
- } catch (FileNotFoundException fnfe) {
- throw new HyphenationException("File not found: " + fnfe.getMessage());
- } catch (IOException ioe) {
- throw new HyphenationException(ioe.getMessage());
} catch (SAXException e) {
- throw new HyphenationException(errMsg);
+ throw new IOException(e);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Mon Aug 13 13:52:46 2012
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
@@ -35,6 +36,7 @@ import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
/**
* {@link Analyzer} for French language.
@@ -54,6 +56,11 @@ public final class FrenchAnalyzer extend
/** File containing default French stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
+ /** Default set of articles for ElisionFilter */
+ public static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+ new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
+ "l", "m", "t", "qu", "n", "s", "j"), true));
+
/**
* Contains words that should be indexed but not stemmed.
*/
@@ -134,7 +141,7 @@ public final class FrenchAnalyzer extend
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
- result = new ElisionFilter(matchVersion, result);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java Mon Aug 13 13:52:46 2012
@@ -23,7 +23,6 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -31,6 +30,7 @@ import org.apache.lucene.analysis.snowba
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.IrishStemmer;
@@ -140,7 +140,7 @@ public final class IrishAnalyzer extends
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
s.setEnablePositionIncrements(false);
result = s;
- result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new IrishLowerCaseFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Mon Aug 13 13:52:46 2012
@@ -24,7 +24,6 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowba
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
@@ -129,7 +129,7 @@ public final class ItalianAnalyzer exten
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
- result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java Mon Aug 13 13:52:46 2012
@@ -40,6 +40,9 @@ public class NumericPayloadTokenFilter e
public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
super(input);
+ if (typeMatch == null) {
+ throw new IllegalArgumentException("typeMatch cannot be null");
+ }
//Need to encode the payload
thePayload = new BytesRef(PayloadHelper.encodeFloat(payload));
this.typeMatch = typeMatch;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro Mon Aug 13 13:52:46 2012
@@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Sunday, March 18, 2012 4:34:02 AM UTC
-// generated on Sunday, March 18, 2012 4:02:55 PM UTC
+// file version from Saturday, July 14, 2012 4:34:14 AM UTC
+// generated on Sunday, July 15, 2012 12:59:44 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
@@ -310,6 +310,7 @@ ASCIITLD = "." (
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
+ | [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Mon Aug 13 13:52:46 2012
@@ -1,8 +1,8 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
package org.apache.lucene.analysis.standard;
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokena
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 08.07.12 16:59 from the specification file
- * <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 8/6/12 11:57 AM from the specification file
+ * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@@ -42,7 +42,7 @@ class ClassicTokenizerImpl implements St
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
- private static final int ZZ_BUFFERSIZE = 16384;
+ private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
public static final int YYINITIAL = 0;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex Mon Aug 13 13:52:46 2012
@@ -1,6 +1,6 @@
package org.apache.lucene.analysis.standard;
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokena
%function getNextToken
%pack
%char
+%buffer 4096
%{
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt Mon Aug 13 13:52:46 2012
@@ -18,4 +18,4 @@
WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
and need to regenerate the tokenizer, only use the trunk version
- of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
+ of JFlex 1.5 (with a minimum SVN revision 607) at the moment!
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro?rev=1372423&r1=1372422&r2=1372423&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro Mon Aug 13 13:52:46 2012
@@ -14,22 +14,25 @@
* limitations under the License.
*/
-// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
+// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
- ([\ud80d][\uDC00-\uDC2E])
+ ([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB])
+ | ([\ud81a][\uDC00-\uDE38])
+ | ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F])
+ | ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
+ | ([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
- | ([\ud81a][\uDC00-\uDE38])
- | ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
- | ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
+ | ([\ud805][\uDE80-\uDEAA])
+ | ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
- | ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
+ | ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
([\ud804][\uDCBD])
@@ -37,14 +40,17 @@ FormatSupp = (
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
- ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
+ ([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
+ | ([\ud805][\uDEAB-\uDEB7])
+ | ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
- ([\ud804][\uDC66-\uDC6F])
+ ([\ud805][\uDEC0-\uDEC9])
+ | ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)