You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2012/09/21 19:22:27 UTC
svn commit: r1388574 [6/45] - in /lucene/dev/branches/LUCENE-2878: ./
dev-tools/ dev-tools/eclipse/ dev-tools/eclipse/dot.settings/
dev-tools/idea/ dev-tools/idea/.idea/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/ dev-tools/idea/lucene/analy...
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro Fri Sep 21 17:21:34 2012
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
+// Generated using ICU4J 49.1.0.0 on Wednesday, September 19, 2012 10:23:34 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Sep 21 17:21:34 2012
@@ -183,8 +183,7 @@ public final class StandardTokenizer ext
}
@Override
- public void setReader(Reader reader) throws IOException {
- super.setReader(reader);
- scanner.yyreset(reader);
+ public void reset() throws IOException {
+ scanner.yyreset(input);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Fri Sep 21 17:21:34 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
package org.apache.lucene.analysis.standard;
@@ -936,7 +936,7 @@ public final class StandardTokenizerImpl
}
}
- // numRead < 0
+ // numRead < 0
return true;
}
@@ -1157,36 +1157,36 @@ public final class StandardTokenizerImpl
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 2:
- { return WORD_TYPE;
+ case 1:
+ { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 9: break;
- case 5:
- { return SOUTH_EAST_ASIAN_TYPE;
+ case 2:
+ { return WORD_TYPE;
}
case 10: break;
- case 4:
- { return KATAKANA_TYPE;
+ case 3:
+ { return NUMERIC_TYPE;
}
case 11: break;
- case 6:
- { return IDEOGRAPHIC_TYPE;
+ case 4:
+ { return KATAKANA_TYPE;
}
case 12: break;
- case 8:
- { return HANGUL_TYPE;
+ case 5:
+ { return SOUTH_EAST_ASIAN_TYPE;
}
case 13: break;
- case 3:
- { return NUMERIC_TYPE;
+ case 6:
+ { return IDEOGRAPHIC_TYPE;
}
case 14: break;
case 7:
{ return HIRAGANA_TYPE;
}
case 15: break;
- case 1:
- { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+ case 8:
+ { return HANGUL_TYPE;
}
case 16: break;
default:
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Fri Sep 21 17:21:34 2012
@@ -115,8 +115,8 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
%%
-// UAX#29 WB1. sot ÷
-// WB2. ÷ eot
+// UAX#29 WB1. sot ÷
+// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
@@ -166,7 +166,7 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
-// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java Fri Sep 21 17:21:34 2012
@@ -162,8 +162,7 @@ public final class UAX29URLEmailTokenize
}
@Override
- public void setReader(Reader reader) throws IOException {
- super.setReader(reader);
- scanner.yyreset(reader);
+ public void reset() throws IOException {
+ scanner.yyreset(input);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java Fri Sep 21 17:21:34 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
package org.apache.lucene.analysis.standard;
@@ -4126,7 +4126,7 @@ public final class UAX29URLEmailTokenize
}
}
- // numRead < 0
+ // numRead < 0
return true;
}
@@ -4347,50 +4347,50 @@ public final class UAX29URLEmailTokenize
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 11:
- // lookahead expression with fixed base length
- zzMarkedPos = zzStartRead + 6;
- { return WORD_TYPE;
+ case 1:
+ { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 12: break;
case 2:
{ return WORD_TYPE;
}
case 13: break;
- case 5:
- { return SOUTH_EAST_ASIAN_TYPE;
+ case 3:
+ { return NUMERIC_TYPE;
}
case 14: break;
- case 1:
- { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+ case 4:
+ { return KATAKANA_TYPE;
}
case 15: break;
- case 10:
- { return URL_TYPE;
+ case 5:
+ { return SOUTH_EAST_ASIAN_TYPE;
}
case 16: break;
- case 9:
- { return EMAIL_TYPE;
+ case 6:
+ { return IDEOGRAPHIC_TYPE;
}
case 17: break;
- case 4:
- { return KATAKANA_TYPE;
+ case 7:
+ { return HIRAGANA_TYPE;
}
case 18: break;
- case 6:
- { return IDEOGRAPHIC_TYPE;
- }
- case 19: break;
case 8:
{ return HANGUL_TYPE;
}
+ case 19: break;
+ case 9:
+ { return EMAIL_TYPE;
+ }
case 20: break;
- case 3:
- { return NUMERIC_TYPE;
+ case 10:
+ { return URL_TYPE;
}
case 21: break;
- case 7:
- { return HIRAGANA_TYPE;
+ case 11:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 6;
+ { return WORD_TYPE;
}
case 22: break;
default:
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Fri Sep 21 17:21:34 2012
@@ -200,8 +200,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
%%
-// UAX#29 WB1. sot ÷
-// WB2. ÷ eot
+// UAX#29 WB1. sot ÷
+// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
@@ -258,7 +258,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
-// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java Fri Sep 21 17:21:34 2012
@@ -227,8 +227,8 @@ public class SynonymMap {
/**
* Add a phrase->phrase synonym mapping.
* Phrases are character sequences where words are
- * separated with character zero (\u0000). Empty words
- * (two \u0000s in a row) are not allowed in the input nor
+ * separated with character zero (U+0000). Empty words
+ * (two U+0000s in a row) are not allowed in the input nor
* the output!
*
* @param input input phrase
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java Fri Sep 21 17:21:34 2012
@@ -37,6 +37,15 @@ import java.util.regex.PatternSyntaxExce
/**
* Abstract parent class for analysis factories {@link TokenizerFactory},
* {@link TokenFilterFactory} and {@link CharFilterFactory}.
+ * <p>
+ * The typical lifecycle for a factory consumer is:
+ * <ol>
* <li>Create factory via its no-arg constructor
+ * <li>Set version emulation by calling {@link #setLuceneMatchVersion(Version)}
* <li>Call {@link #init(Map)} passing arguments as key-value mappings.
+ * <li>(Optional) If the factory uses resources such as files, {@link ResourceLoaderAware#inform(ResourceLoader)} is called to initialize those resources.
+ * <li>Consumer calls create() to obtain instances.
+ * </ol>
*/
public abstract class AbstractAnalysisFactory {
@@ -46,6 +55,9 @@ public abstract class AbstractAnalysisFa
/** the luceneVersion arg */
protected Version luceneMatchVersion = null;
+ /**
+ * Initialize this factory via a set of key-value pairs.
+ */
public void init(Map<String,String> args) {
this.args = args;
}
@@ -104,6 +116,9 @@ public abstract class AbstractAnalysisFa
return Boolean.parseBoolean(s);
}
+ /**
+ * Compiles a pattern for the value of the specified argument key <code>name</code>
+ */
protected Pattern getPattern(String name) {
try {
String pat = args.get(name);
@@ -118,6 +133,10 @@ public abstract class AbstractAnalysisFa
}
}
+ /**
+ * Returns a {@link CharArraySet} from wordFiles, which
+ * can be a comma-separated list of filenames
+ */
protected CharArraySet getWordSet(ResourceLoader loader,
String wordFiles, boolean ignoreCase) throws IOException {
assureMatchVersion();
@@ -137,6 +156,9 @@ public abstract class AbstractAnalysisFa
return words;
}
+ /**
+ * Returns the resource's lines (with content treated as UTF-8)
+ */
protected List<String> getLines(ResourceLoader loader, String resource) throws IOException {
return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java Fri Sep 21 17:21:34 2012
@@ -78,7 +78,8 @@ public abstract class CharTokenizer exte
charUtils = CharacterUtils.getInstance(matchVersion);
}
- private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
+ // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
+ private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
@@ -162,8 +163,7 @@ public abstract class CharTokenizer exte
}
@Override
- public void setReader(Reader input) throws IOException {
- super.setReader(input);
+ public void reset() throws IOException {
bufferIndex = 0;
offset = 0;
dataLen = 0;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java Fri Sep 21 17:21:34 2012
@@ -34,7 +34,7 @@ import org.apache.lucene.analysis.fr.Fre
* </fieldType></pre>
*
*/
-public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware, MultiTermAwareComponent {
private CharArraySet articles;
@@ -53,5 +53,10 @@ public class ElisionFilterFactory extend
public ElisionFilter create(TokenStream input) {
return new ElisionFilter(input, articles);
}
+
+ @Override
+ public AbstractAnalysisFactory getMultiTermComponent() {
+ return this;
+ }
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceLoaderAware.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceLoaderAware.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceLoaderAware.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceLoaderAware.java Fri Sep 21 17:21:34 2012
@@ -27,5 +27,9 @@ import java.io.IOException;
*/
public interface ResourceLoaderAware {
+ /**
+ * Initializes this component with the provided ResourceLoader
+ * (used for loading classes, files, etc).
+ */
void inform(ResourceLoader loader) throws IOException;
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java Fri Sep 21 17:21:34 2012
@@ -19,6 +19,9 @@ package org.apache.lucene.analysis.util;
/** Some commonly-used stemming functions */
public class StemmerUtil {
+ /** no instance */
+ private StemmerUtil() {}
+
/**
* Returns true if the character array starts with the suffix.
*
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java Fri Sep 21 17:21:34 2012
@@ -36,7 +36,10 @@ import org.apache.lucene.util.Version;
*/
public class WordlistLoader {
- private static final int INITITAL_CAPACITY = 16;
+ private static final int INITIAL_CAPACITY = 16;
+
+ /** no instance */
+ private WordlistLoader() {}
/**
* Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
@@ -74,7 +77,7 @@ public class WordlistLoader {
* @return A {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
- return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
+ return getWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
}
/**
@@ -89,7 +92,7 @@ public class WordlistLoader {
* @return A CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
- return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
+ return getWordSet(reader, comment, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
}
/**
@@ -171,7 +174,7 @@ public class WordlistLoader {
* @return A {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
- return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
+ return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java Fri Sep 21 17:21:34 2012
@@ -318,19 +318,13 @@ public final class WikipediaTokenizer ex
*/
@Override
public void reset() throws IOException {
- super.reset();
+ scanner.yyreset(input);
tokens = null;
scanner.reset();
first = true;
}
@Override
- public void setReader(Reader reader) throws IOException {
- super.setReader(reader);
- scanner.yyreset(input);
- }
-
- @Override
public void end() {
// set final offset
final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Fri Sep 21 17:21:34 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/19/12 6:23 PM */
package org.apache.lucene.analysis.wikipedia;
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokena
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 8/6/12 11:57 AM from the specification file
- * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * on 9/19/12 6:23 PM from the specification file
+ * <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@@ -37,16 +37,16 @@ class WikipediaTokenizerImpl {
private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
- public static final int THREE_SINGLE_QUOTES_STATE = 10;
+ public static final int YYINITIAL = 0;
+ public static final int CATEGORY_STATE = 2;
+ public static final int INTERNAL_LINK_STATE = 4;
public static final int EXTERNAL_LINK_STATE = 6;
+ public static final int TWO_SINGLE_QUOTES_STATE = 8;
+ public static final int THREE_SINGLE_QUOTES_STATE = 10;
+ public static final int FIVE_SINGLE_QUOTES_STATE = 12;
public static final int DOUBLE_EQUALS_STATE = 14;
- public static final int INTERNAL_LINK_STATE = 4;
public static final int DOUBLE_BRACE_STATE = 16;
- public static final int CATEGORY_STATE = 2;
- public static final int YYINITIAL = 0;
public static final int STRING = 18;
- public static final int FIVE_SINGLE_QUOTES_STATE = 12;
- public static final int TWO_SINGLE_QUOTES_STATE = 8;
/**
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
@@ -589,7 +589,7 @@ final void reset() {
}
}
- // numRead < 0
+ // numRead < 0
return true;
}
@@ -810,188 +810,188 @@ final void reset() {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 44:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
+ case 1:
+ { numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 47: break;
- case 37:
- { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 2:
+ { positionInc = 1; return ALPHANUM;
}
case 48: break;
- case 16:
- { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
+ case 3:
+ { positionInc = 1; return CJ;
}
case 49: break;
- case 20:
- { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 4:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 50: break;
- case 40:
- { positionInc = 1; return ACRONYM;
- }
- case 51: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
+ case 51: break;
+ case 6:
+ { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
+ }
case 52: break;
- case 36:
- { positionInc = 1; return COMPANY;
+ case 7:
+ { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
case 53: break;
- case 10:
- { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+ case 8:
+ { /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 54: break;
- case 15:
- { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
+ case 9:
+ { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
case 55: break;
- case 22:
- { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
+ case 10:
+ { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 56: break;
- case 35:
- { positionInc = 1; return NUM;
+ case 11:
+ { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 57: break;
- case 33:
- { positionInc = 1; return APOSTROPHE;
+ case 12:
+ { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
case 58: break;
- case 21:
- { yybegin(STRING); return currentTokType;/*pipe*/
+ case 13:
+ { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 59: break;
- case 18:
- { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
+ case 14:
+ { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
case 60: break;
- case 2:
- { positionInc = 1; return ALPHANUM;
+ case 15:
+ { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
case 61: break;
- case 1:
- { numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
+ case 16:
+ { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
case 62: break;
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
case 63: break;
- case 39:
- { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
+ case 18:
+ { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
case 64: break;
- case 29:
- { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 19:
+ { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
case 65: break;
- case 46:
- { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 20:
+ { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 66: break;
- case 27:
- { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+ case 21:
+ { yybegin(STRING); return currentTokType;/*pipe*/
}
case 67: break;
- case 4:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
+ case 22:
+ { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
case 68: break;
- case 38:
- { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
+ case 23:
+ { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 69: break;
- case 13:
- { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 24:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 70: break;
- case 3:
- { positionInc = 1; return CJ;
+ case 25:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 71: break;
- case 45:
- { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 26:
+ { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
case 72: break;
- case 6:
- { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
+ case 27:
+ { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 73: break;
- case 11:
- { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 28:
+ { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 74: break;
- case 25:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
+ case 29:
+ { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 75: break;
- case 8:
- { /* Break so we don't hit fall-through warning: */ break;/* ignore */
+ case 30:
+ { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 76: break;
- case 19:
- { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
+ case 31:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
}
case 77: break;
- case 43:
- { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+ case 32:
+ { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 78: break;
- case 42:
- { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
+ case 33:
+ { positionInc = 1; return APOSTROPHE;
}
case 79: break;
- case 30:
- { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+ case 34:
+ { positionInc = 1; return HOST;
}
case 80: break;
- case 14:
- { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
+ case 35:
+ { positionInc = 1; return NUM;
}
case 81: break;
- case 9:
- { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
+ case 36:
+ { positionInc = 1; return COMPANY;
}
case 82: break;
- case 7:
- { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
+ case 37:
+ { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 83: break;
- case 41:
- { positionInc = 1; return EMAIL;
+ case 38:
+ { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
case 84: break;
- case 28:
- { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 39:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
case 85: break;
- case 23:
- { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
+ case 40:
+ { positionInc = 1; return ACRONYM;
}
case 86: break;
- case 34:
- { positionInc = 1; return HOST;
+ case 41:
+ { positionInc = 1; return EMAIL;
}
case 87: break;
- case 32:
- { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ case 42:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
case 88: break;
- case 12:
- { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
+ case 43:
+ { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 89: break;
- case 24:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
+ case 44:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 90: break;
- case 31:
- { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
+ case 45:
+ { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 91: break;
- case 26:
- { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
+ case 46:
+ { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 92: break;
default:
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex Fri Sep 21 17:21:34 2012
@@ -136,7 +136,7 @@ NUM = ({ALPHANUM} {P} {HAS_DIGIT}
TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
// punctuation
-P = ("_"|"-"|"/"|"."|",")
+P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT =
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/collation/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/collation/package.html?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/collation/package.html (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/collation/package.html Fri Sep 21 17:21:34 2012
@@ -53,17 +53,17 @@
<pre class="prettyprint">
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
- CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
+ CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(version, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(version, analyzer));
Document doc = new Document();
- doc.add(new Field("content", "\u0633\u0627\u0628",
- Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES));
writer.addDocument(doc);
writer.close();
- IndexSearcher is = new IndexSearcher(ramDir, true);
+ IndexReader ir = DirectoryReader.open(ramDir);
+ IndexSearcher is = new IndexSearcher(ir);
- QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ QueryParser aqp = new QueryParser(version, "content", analyzer);
aqp.setAnalyzeRangeTerms(true);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
@@ -79,20 +79,21 @@
<h3>Danish Sorting</h3>
<pre class="prettyprint">
Analyzer analyzer
- = new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(version, Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(version, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
for (int i = 0 ; i < data.length ; ++i) {
Document doc = new Document();
- doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
- doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
+ doc.add(new StoredField("tracer", tracer[i]));
+ doc.add(new TextField("contents", data[i], Field.Store.NO));
writer.addDocument(doc);
}
writer.close();
- IndexSearcher searcher = new IndexSearcher(indexStore, true);
+ IndexReader ir = DirectoryReader.open(indexStore);
+ IndexSearcher searcher = new IndexSearcher(ir);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -107,15 +108,16 @@
<pre class="prettyprint">
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
+ Analyzer analyzer = new CollationKeyAnalyzer(version, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(version, analyzer));
Document doc = new Document();
- doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
+ doc.add(new TextField("contents", "DIGY", Field.Store.NO));
writer.addDocument(doc);
writer.close();
- IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
+ IndexReader ir = DirectoryReader.open(ramDir);
+ IndexSearcher is = new IndexSearcher(ir);
+ QueryParser parser = new QueryParser(version, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/Among.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/Among.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/Among.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/Among.java Fri Sep 21 17:21:34 2012
@@ -43,25 +43,26 @@ import java.lang.reflect.Method;
* reflection calls (Lovins, etc) use EMPTY_ARGS/EMPTY_PARAMS
*/
public class Among {
- private static final Class<?>[] EMPTY_PARAMS = new Class[0];
- public Among (String s, int substring_i, int result,
- String methodname, SnowballProgram methodobject) {
- this.s_size = s.length();
- this.s = s.toCharArray();
- this.substring_i = substring_i;
- this.result = result;
- this.methodobject = methodobject;
- if (methodname.length() == 0) {
- this.method = null;
- } else {
- try {
- this.method = methodobject.getClass().
- getDeclaredMethod(methodname, EMPTY_PARAMS);
- } catch (NoSuchMethodException e) {
- throw new RuntimeException(e);
- }
- }
+ private static final Class<?>[] EMPTY_PARAMS = new Class[0];
+
+ public Among(String s, int substring_i, int result,
+ String methodname, SnowballProgram methodobject) {
+ this.s_size = s.length();
+ this.s = s.toCharArray();
+ this.substring_i = substring_i;
+ this.result = result;
+ this.methodobject = methodobject;
+ if (methodname.length() == 0) {
+ this.method = null;
+ } else {
+ try {
+ this.method = methodobject.getClass().
+ getDeclaredMethod(methodname, EMPTY_PARAMS);
+ } catch (NoSuchMethodException e) {
+ throw new RuntimeException(e);
+ }
}
+ }
public final int s_size; /* search string */
public final char[] s; /* search string */
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/SnowballProgram.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/SnowballProgram.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/SnowballProgram.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/tartarus/snowball/SnowballProgram.java Fri Sep 21 17:21:34 2012
@@ -51,8 +51,8 @@ public abstract class SnowballProgram {
protected SnowballProgram()
{
- current = new char[8];
- setCurrent("");
+ current = new char[8];
+ setCurrent("");
}
public abstract boolean stem();
@@ -62,12 +62,12 @@ public abstract class SnowballProgram {
*/
public void setCurrent(String value)
{
- current = value.toCharArray();
- cursor = 0;
- limit = value.length();
- limit_backward = 0;
- bra = cursor;
- ket = limit;
+ current = value.toCharArray();
+ cursor = 0;
+ limit = value.length();
+ limit_backward = 0;
+ bra = cursor;
+ ket = limit;
}
/**
@@ -130,354 +130,350 @@ public abstract class SnowballProgram {
protected void copy_from(SnowballProgram other)
{
- current = other.current;
- cursor = other.cursor;
- limit = other.limit;
- limit_backward = other.limit_backward;
- bra = other.bra;
- ket = other.ket;
+ current = other.current;
+ cursor = other.cursor;
+ limit = other.limit;
+ limit_backward = other.limit_backward;
+ bra = other.bra;
+ ket = other.ket;
}
protected boolean in_grouping(char [] s, int min, int max)
{
- if (cursor >= limit) return false;
- char ch = current[cursor];
- if (ch > max || ch < min) return false;
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
- cursor++;
- return true;
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (ch > max || ch < min) return false;
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
+ cursor++;
+ return true;
}
protected boolean in_grouping_b(char [] s, int min, int max)
{
- if (cursor <= limit_backward) return false;
- char ch = current[cursor - 1];
- if (ch > max || ch < min) return false;
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
- cursor--;
- return true;
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if (ch > max || ch < min) return false;
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
+ cursor--;
+ return true;
}
protected boolean out_grouping(char [] s, int min, int max)
{
- if (cursor >= limit) return false;
- char ch = current[cursor];
- if (ch > max || ch < min) {
- cursor++;
- return true;
- }
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
- cursor ++;
- return true;
- }
- return false;
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (ch > max || ch < min) {
+ cursor++;
+ return true;
+ }
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
+ cursor ++;
+ return true;
+ }
+ return false;
}
protected boolean out_grouping_b(char [] s, int min, int max)
{
- if (cursor <= limit_backward) return false;
- char ch = current[cursor - 1];
- if (ch > max || ch < min) {
- cursor--;
- return true;
- }
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
- cursor--;
- return true;
- }
- return false;
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if (ch > max || ch < min) {
+ cursor--;
+ return true;
+ }
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
+ cursor--;
+ return true;
+ }
+ return false;
}
protected boolean in_range(int min, int max)
{
- if (cursor >= limit) return false;
- char ch = current[cursor];
- if (ch > max || ch < min) return false;
- cursor++;
- return true;
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (ch > max || ch < min) return false;
+ cursor++;
+ return true;
}
protected boolean in_range_b(int min, int max)
{
- if (cursor <= limit_backward) return false;
- char ch = current[cursor - 1];
- if (ch > max || ch < min) return false;
- cursor--;
- return true;
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if (ch > max || ch < min) return false;
+ cursor--;
+ return true;
}
protected boolean out_range(int min, int max)
{
- if (cursor >= limit) return false;
- char ch = current[cursor];
- if (!(ch > max || ch < min)) return false;
- cursor++;
- return true;
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (!(ch > max || ch < min)) return false;
+ cursor++;
+ return true;
}
protected boolean out_range_b(int min, int max)
{
- if (cursor <= limit_backward) return false;
- char ch = current[cursor - 1];
- if(!(ch > max || ch < min)) return false;
- cursor--;
- return true;
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if(!(ch > max || ch < min)) return false;
+ cursor--;
+ return true;
}
protected boolean eq_s(int s_size, CharSequence s)
{
- if (limit - cursor < s_size) return false;
- int i;
- for (i = 0; i != s_size; i++) {
- if (current[cursor + i] != s.charAt(i)) return false;
- }
- cursor += s_size;
- return true;
+ if (limit - cursor < s_size) return false;
+ int i;
+ for (i = 0; i != s_size; i++) {
+ if (current[cursor + i] != s.charAt(i)) return false;
+ }
+ cursor += s_size;
+ return true;
}
protected boolean eq_s_b(int s_size, CharSequence s)
{
- if (cursor - limit_backward < s_size) return false;
- int i;
- for (i = 0; i != s_size; i++) {
- if (current[cursor - s_size + i] != s.charAt(i)) return false;
- }
- cursor -= s_size;
- return true;
+ if (cursor - limit_backward < s_size) return false;
+ int i;
+ for (i = 0; i != s_size; i++) {
+ if (current[cursor - s_size + i] != s.charAt(i)) return false;
+ }
+ cursor -= s_size;
+ return true;
}
protected boolean eq_v(CharSequence s)
{
- return eq_s(s.length(), s);
+ return eq_s(s.length(), s);
}
protected boolean eq_v_b(CharSequence s)
- { return eq_s_b(s.length(), s);
+ {
+ return eq_s_b(s.length(), s);
}
protected int find_among(Among v[], int v_size)
{
- int i = 0;
- int j = v_size;
+ int i = 0;
+ int j = v_size;
+
+ int c = cursor;
+ int l = limit;
- int c = cursor;
- int l = limit;
+ int common_i = 0;
+ int common_j = 0;
- int common_i = 0;
- int common_j = 0;
+ boolean first_key_inspected = false;
+
+ while (true) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j; // smaller
+ Among w = v[k];
+ int i2;
+ for (i2 = common; i2 < w.s_size; i2++) {
+ if (c + common == l) {
+ diff = -1;
+ break;
+ }
+ diff = current[c + common] - w.s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
+ if (j - i <= 1) {
+ if (i > 0) break; // v->s has been inspected
+ if (j == i) break; // only one item in v
+
+ // - but now we need to go round once more to get
+ // v->s inspected. This looks messy, but is actually
+ // the optimal approach.
- boolean first_key_inspected = false;
-
- while(true) {
- int k = i + ((j - i) >> 1);
- int diff = 0;
- int common = common_i < common_j ? common_i : common_j; // smaller
- Among w = v[k];
- int i2;
- for (i2 = common; i2 < w.s_size; i2++) {
- if (c + common == l) {
- diff = -1;
- break;
- }
- diff = current[c + common] - w.s[i2];
- if (diff != 0) break;
- common++;
- }
- if (diff < 0) {
- j = k;
- common_j = common;
- } else {
- i = k;
- common_i = common;
- }
- if (j - i <= 1) {
- if (i > 0) break; // v->s has been inspected
- if (j == i) break; // only one item in v
-
- // - but now we need to go round once more to get
- // v->s inspected. This looks messy, but is actually
- // the optimal approach.
-
- if (first_key_inspected) break;
- first_key_inspected = true;
- }
- }
- while(true) {
- Among w = v[i];
- if (common_i >= w.s_size) {
- cursor = c + w.s_size;
- if (w.method == null) return w.result;
- boolean res;
- try {
- Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
- res = resobj.toString().equals("true");
- } catch (InvocationTargetException e) {
- res = false;
- // FIXME - debug message
- } catch (IllegalAccessException e) {
- res = false;
- // FIXME - debug message
- }
- cursor = c + w.s_size;
- if (res) return w.result;
- }
- i = w.substring_i;
- if (i < 0) return 0;
- }
+ if (first_key_inspected) break;
+ first_key_inspected = true;
+ }
+ }
+ while (true) {
+ Among w = v[i];
+ if (common_i >= w.s_size) {
+ cursor = c + w.s_size;
+ if (w.method == null) return w.result;
+ boolean res;
+ try {
+ Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
+ res = resobj.toString().equals("true");
+ } catch (InvocationTargetException e) {
+ res = false;
+ // FIXME - debug message
+ } catch (IllegalAccessException e) {
+ res = false;
+ // FIXME - debug message
+ }
+ cursor = c + w.s_size;
+ if (res) return w.result;
+ }
+ i = w.substring_i;
+ if (i < 0) return 0;
+ }
}
- // find_among_b is for backwards processing. Same comments apply
+ // find_among_b is for backwards processing. Same comments apply
protected int find_among_b(Among v[], int v_size)
{
- int i = 0;
- int j = v_size;
+ int i = 0;
+ int j = v_size;
- int c = cursor;
- int lb = limit_backward;
+ int c = cursor;
+ int lb = limit_backward;
- int common_i = 0;
- int common_j = 0;
+ int common_i = 0;
+ int common_j = 0;
- boolean first_key_inspected = false;
-
- while(true) {
- int k = i + ((j - i) >> 1);
- int diff = 0;
- int common = common_i < common_j ? common_i : common_j;
- Among w = v[k];
- int i2;
- for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
- if (c - common == lb) {
- diff = -1;
- break;
- }
- diff = current[c - 1 - common] - w.s[i2];
- if (diff != 0) break;
- common++;
- }
- if (diff < 0) {
- j = k;
- common_j = common;
- } else {
- i = k;
- common_i = common;
- }
- if (j - i <= 1) {
- if (i > 0) break;
- if (j == i) break;
- if (first_key_inspected) break;
- first_key_inspected = true;
- }
- }
- while(true) {
- Among w = v[i];
- if (common_i >= w.s_size) {
- cursor = c - w.s_size;
- if (w.method == null) return w.result;
-
- boolean res;
- try {
- Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
- res = resobj.toString().equals("true");
- } catch (InvocationTargetException e) {
- res = false;
- // FIXME - debug message
- } catch (IllegalAccessException e) {
- res = false;
- // FIXME - debug message
- }
- cursor = c - w.s_size;
- if (res) return w.result;
- }
- i = w.substring_i;
- if (i < 0) return 0;
- }
+ boolean first_key_inspected = false;
+
+ while (true) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j;
+ Among w = v[k];
+ int i2;
+ for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
+ if (c - common == lb) {
+ diff = -1;
+ break;
+ }
+ diff = current[c - 1 - common] - w.s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
+ if (j - i <= 1) {
+ if (i > 0) break;
+ if (j == i) break;
+ if (first_key_inspected) break;
+ first_key_inspected = true;
+ }
+ }
+ while (true) {
+ Among w = v[i];
+ if (common_i >= w.s_size) {
+ cursor = c - w.s_size;
+ if (w.method == null) return w.result;
+
+ boolean res;
+ try {
+ Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
+ res = resobj.toString().equals("true");
+ } catch (InvocationTargetException e) {
+ res = false;
+ // FIXME - debug message
+ } catch (IllegalAccessException e) {
+ res = false;
+ // FIXME - debug message
+ }
+ cursor = c - w.s_size;
+ if (res) return w.result;
+ }
+ i = w.substring_i;
+ if (i < 0) return 0;
+ }
}
- /* to replace chars between c_bra and c_ket in current by the
+ /* to replace chars between c_bra and c_ket in current by the
* chars in s.
*/
- protected int replace_s(int c_bra, int c_ket, CharSequence s)
- {
- final int adjustment = s.length() - (c_ket - c_bra);
- final int newLength = limit + adjustment;
- //resize if necessary
- if (newLength > current.length) {
- char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
- System.arraycopy(current, 0, newBuffer, 0, limit);
- current = newBuffer;
- }
- // if the substring being replaced is longer or shorter than the
- // replacement, need to shift things around
- if (adjustment != 0 && c_ket < limit) {
- System.arraycopy(current, c_ket, current, c_bra + s.length(),
- limit - c_ket);
- }
- // insert the replacement text
- // Note, faster is s.getChars(0, s.length(), current, c_bra);
- // but would have to duplicate this method for both String and StringBuilder
- for (int i = 0; i < s.length(); i++)
- current[c_bra + i] = s.charAt(i);
-
- limit += adjustment;
- if (cursor >= c_ket) cursor += adjustment;
- else if (cursor > c_bra) cursor = c_bra;
- return adjustment;
- }
-
- protected void slice_check()
- {
- if (bra < 0 ||
- bra > ket ||
- ket > limit)
- {
- throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
- // FIXME: report error somehow.
- /*
- fprintf(stderr, "faulty slice operation:\n");
- debug(z, -1, 0);
- exit(1);
- */
- }
- }
-
- protected void slice_from(CharSequence s)
- {
- slice_check();
- replace_s(bra, ket, s);
- }
-
- protected void slice_del()
- {
- slice_from((CharSequence)"");
- }
-
- protected void insert(int c_bra, int c_ket, CharSequence s)
- {
- int adjustment = replace_s(c_bra, c_ket, s);
- if (c_bra <= bra) bra += adjustment;
- if (c_bra <= ket) ket += adjustment;
+ protected int replace_s(int c_bra, int c_ket, CharSequence s) {
+ final int adjustment = s.length() - (c_ket - c_bra);
+ final int newLength = limit + adjustment;
+ //resize if necessary
+ if (newLength > current.length) {
+ char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
+ System.arraycopy(current, 0, newBuffer, 0, limit);
+ current = newBuffer;
+ }
+ // if the substring being replaced is longer or shorter than the
+ // replacement, need to shift things around
+ if (adjustment != 0 && c_ket < limit) {
+ System.arraycopy(current, c_ket, current, c_bra + s.length(),
+ limit - c_ket);
+ }
+ // insert the replacement text
+ // Note, faster is s.getChars(0, s.length(), current, c_bra);
+ // but would have to duplicate this method for both String and StringBuilder
+ for (int i = 0; i < s.length(); i++)
+ current[c_bra + i] = s.charAt(i);
+
+ limit += adjustment;
+ if (cursor >= c_ket) cursor += adjustment;
+ else if (cursor > c_bra) cursor = c_bra;
+ return adjustment;
+ }
+
+ protected void slice_check() {
+ if (bra < 0 ||
+ bra > ket ||
+ ket > limit) {
+ throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
+ // FIXME: report error somehow.
+ /*
+ fprintf(stderr, "faulty slice operation:\n");
+ debug(z, -1, 0);
+ exit(1);
+ */
+ }
+ }
+
+ protected void slice_from(CharSequence s) {
+ slice_check();
+ replace_s(bra, ket, s);
+ }
+
+ protected void slice_del() {
+ slice_from((CharSequence) "");
+ }
+
+ protected void insert(int c_bra, int c_ket, CharSequence s)
+ {
+ int adjustment = replace_s(c_bra, c_ket, s);
+ if (c_bra <= bra) bra += adjustment;
+ if (c_bra <= ket) ket += adjustment;
}
/* Copy the slice into the supplied StringBuffer */
protected StringBuilder slice_to(StringBuilder s)
{
- slice_check();
- int len = ket - bra;
- s.setLength(0);
- s.append(current, bra, len);
- return s;
+ slice_check();
+ int len = ket - bra;
+ s.setLength(0);
+ s.append(current, bra, len);
+ return s;
}
protected StringBuilder assign_to(StringBuilder s)
{
- s.setLength(0);
- s.append(current, 0, limit);
- return s;
+ s.setLength(0);
+ s.append(current, 0, limit);
+ return s;
}
/*
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Fri Sep 21 17:21:34 2012
@@ -38,87 +38,87 @@ import org.apache.lucene.analysis.util.C
public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
public void testWithSnowballExamples() throws Exception {
- check("boa", "boa");
- check("boainain", "boainain");
- check("boas", "boas");
- check("bôas", "boas"); // removes diacritic: different from snowball portugese
- check("boassu", "boassu");
- check("boataria", "boat");
- check("boate", "boat");
- check("boates", "boat");
- check("boatos", "boat");
- check("bob", "bob");
- check("boba", "bob");
- check("bobagem", "bobag");
- check("bobagens", "bobagens");
- check("bobalhões", "bobalho"); // removes diacritic: different from snowball portugese
- check("bobear", "bob");
- check("bobeira", "bobeir");
- check("bobinho", "bobinh");
- check("bobinhos", "bobinh");
- check("bobo", "bob");
- check("bobs", "bobs");
- check("boca", "boc");
- check("bocadas", "boc");
- check("bocadinho", "bocadinh");
- check("bocado", "boc");
- check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
- check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
- check("bocarra", "bocarr");
- check("bocas", "boc");
- check("bode", "bod");
- check("bodoque", "bodoqu");
- check("body", "body");
- check("boeing", "boeing");
- check("boem", "boem");
- check("boemia", "boem");
- check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
- check("bogotá", "bogot");
- check("boi", "boi");
- check("bóia", "boi"); // removes diacritic: different from snowball portuguese
- check("boiando", "boi");
- check("quiabo", "quiab");
- check("quicaram", "quic");
- check("quickly", "quickly");
- check("quieto", "quiet");
- check("quietos", "quiet");
- check("quilate", "quilat");
- check("quilates", "quilat");
- check("quilinhos", "quilinh");
- check("quilo", "quil");
- check("quilombo", "quilomb");
- check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
- check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
- check("quilômetro", "quilometr"); // removes diacritic: different from snowball portoguese
- check("quilômetros", "quilometr"); // removes diacritic: different from snowball portoguese
- check("quilos", "quil");
- check("quimica", "quimic");
- check("quilos", "quil");
- check("quimica", "quimic");
- check("quimicas", "quimic");
- check("quimico", "quimic");
- check("quimicos", "quimic");
- check("quimioterapia", "quimioterap");
- check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portoguese
- check("quimono", "quimon");
- check("quincas", "quinc");
- check("quinhão", "quinha"); // removes diacritic: different from snowball portoguese
- check("quinhentos", "quinhent");
- check("quinn", "quinn");
- check("quino", "quin");
- check("quinta", "quint");
- check("quintal", "quintal");
- check("quintana", "quintan");
- check("quintanilha", "quintanilh");
- check("quintão", "quinta"); // removes diacritic: different from snowball portoguese
- check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
- check("quintino", "quintin");
- check("quinto", "quint");
- check("quintos", "quint");
- check("quintuplicou", "quintuplic");
- check("quinze", "quinz");
- check("quinzena", "quinzen");
- check("quiosque", "quiosqu");
+ check("boa", "boa");
+ check("boainain", "boainain");
+ check("boas", "boas");
+ check("bôas", "boas"); // removes diacritic: different from snowball portugese
+ check("boassu", "boassu");
+ check("boataria", "boat");
+ check("boate", "boat");
+ check("boates", "boat");
+ check("boatos", "boat");
+ check("bob", "bob");
+ check("boba", "bob");
+ check("bobagem", "bobag");
+ check("bobagens", "bobagens");
+ check("bobalhões", "bobalho"); // removes diacritic: different from snowball portugese
+ check("bobear", "bob");
+ check("bobeira", "bobeir");
+ check("bobinho", "bobinh");
+ check("bobinhos", "bobinh");
+ check("bobo", "bob");
+ check("bobs", "bobs");
+ check("boca", "boc");
+ check("bocadas", "boc");
+ check("bocadinho", "bocadinh");
+ check("bocado", "boc");
+ check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
+ check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
+ check("bocarra", "bocarr");
+ check("bocas", "boc");
+ check("bode", "bod");
+ check("bodoque", "bodoqu");
+ check("body", "body");
+ check("boeing", "boeing");
+ check("boem", "boem");
+ check("boemia", "boem");
+ check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
+ check("bogotá", "bogot");
+ check("boi", "boi");
+ check("bóia", "boi"); // removes diacritic: different from snowball portuguese
+ check("boiando", "boi");
+ check("quiabo", "quiab");
+ check("quicaram", "quic");
+ check("quickly", "quickly");
+ check("quieto", "quiet");
+ check("quietos", "quiet");
+ check("quilate", "quilat");
+ check("quilates", "quilat");
+ check("quilinhos", "quilinh");
+ check("quilo", "quil");
+ check("quilombo", "quilomb");
+ check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
+ check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
+ check("quilômetro", "quilometr"); // removes diacritic: different from snowball portoguese
+ check("quilômetros", "quilometr"); // removes diacritic: different from snowball portoguese
+ check("quilos", "quil");
+ check("quimica", "quimic");
+ check("quilos", "quil");
+ check("quimica", "quimic");
+ check("quimicas", "quimic");
+ check("quimico", "quimic");
+ check("quimicos", "quimic");
+ check("quimioterapia", "quimioterap");
+ check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portoguese
+ check("quimono", "quimon");
+ check("quincas", "quinc");
+ check("quinhão", "quinha"); // removes diacritic: different from snowball portoguese
+ check("quinhentos", "quinhent");
+ check("quinn", "quinn");
+ check("quino", "quin");
+ check("quinta", "quint");
+ check("quintal", "quintal");
+ check("quintana", "quintan");
+ check("quintanilha", "quintanilh");
+ check("quintão", "quinta"); // removes diacritic: different from snowball portoguese
+ check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
+ check("quintino", "quintin");
+ check("quinto", "quint");
+ check("quintos", "quint");
+ check("quintuplicou", "quintuplic");
+ check("quinze", "quinz");
+ check("quinzena", "quinzen");
+ check("quiosque", "quiosqu");
}
public void testNormalization() throws Exception {
@@ -175,4 +175,4 @@ public class TestBrazilianStemmer extend
};
checkOneTermReuse(a, "", "");
}
-}
\ No newline at end of file
+}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java Fri Sep 21 17:21:34 2012
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.CharFi
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@@ -55,6 +56,11 @@ public class TestMappingCharFilter exten
builder.add( "empty", "" );
+ // non-BMP char (encoded as a surrogate pair in UTF-16):
+ builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
+
+ builder.add("\uff01", "full-width-exclamation");
+
normMap = builder.build();
}
@@ -128,6 +134,18 @@ public class TestMappingCharFilter exten
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
+ public void testNonBMPChar() throws Exception {
+ CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
+ }
+
+ public void testFullWidthChar() throws Exception {
+ CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
+ }
+
//
// 1111111111222
// 01234567890123456789012
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java Fri Sep 21 17:21:34 2012
@@ -39,6 +39,7 @@ public class CommonGramsFilterTest exten
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
+ cgf.reset();
assertTrue(cgf.incrementToken());
assertEquals("How", term.toString());
assertTrue(cgf.incrementToken());
@@ -61,6 +62,7 @@ public class CommonGramsFilterTest exten
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
+ nsf.reset();
assertTrue(nsf.incrementToken());
assertEquals("How_the", term.toString());
assertTrue(nsf.incrementToken());
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Fri Sep 21 17:21:34 2012
@@ -235,6 +235,7 @@ public class TestCompoundWordTokenFilter
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
+ tf.reset();
assertTrue(tf.incrementToken());
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
assertTrue(tf.incrementToken());
@@ -256,6 +257,7 @@ public class TestCompoundWordTokenFilter
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+ stream.reset();
while (stream.incrementToken()) {
assertTrue("Custom attribute value was lost", retAtt.getRetain());
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java Fri Sep 21 17:21:34 2012
@@ -80,6 +80,7 @@ public class TestAnalyzers extends BaseT
void verifyPayload(TokenStream ts) throws IOException {
PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+ ts.reset();
for(byte b=1;;b++) {
boolean hasNext = ts.incrementToken();
if (!hasNext) break;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java Fri Sep 21 17:21:34 2012
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTo
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.StringMockResourceLoader;
@@ -114,11 +115,15 @@ public class TestFactories extends BaseT
}
/** tries to initialize a factory with no arguments */
- private boolean initialize(AbstractAnalysisFactory factory) {
+ private boolean initialize(AbstractAnalysisFactory factory) throws IOException {
boolean success = false;
try {
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(Collections.<String,String>emptyMap());
+ if (factory instanceof ResourceLoaderAware) {
+ ResourceLoaderAware resourceLoaderAware = (ResourceLoaderAware) factory;
+ resourceLoaderAware.inform(new ClasspathResourceLoader(factory.getClass()));
+ }
success = true;
} catch (IllegalArgumentException ignored) {
// its ok if we dont provide the right parameters to throw this
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Fri Sep 21 17:21:34 2012
@@ -782,31 +782,51 @@ public class TestRandomChains extends Ba
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
readSomething = true;
- return in.read(cbuf, off, len);
+ return input.read(cbuf, off, len);
}
@Override
public int read() throws IOException {
readSomething = true;
- return in.read();
+ return input.read();
}
@Override
public int read(CharBuffer target) throws IOException {
readSomething = true;
- return in.read(target);
+ return input.read(target);
}
@Override
public int read(char[] cbuf) throws IOException {
readSomething = true;
- return in.read(cbuf);
+ return input.read(cbuf);
}
@Override
public long skip(long n) throws IOException {
readSomething = true;
- return in.skip(n);
+ return input.skip(n);
+ }
+
+ @Override
+ public void mark(int readAheadLimit) throws IOException {
+ input.mark(readAheadLimit);
+ }
+
+ @Override
+ public boolean markSupported() {
+ return input.markSupported();
+ }
+
+ @Override
+ public boolean ready() throws IOException {
+ return input.ready();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java Fri Sep 21 17:21:34 2012
@@ -66,6 +66,7 @@ public class TestStopAnalyzer extends Ba
assertNotNull(stream);
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+ stream.reset();
while (stream.incrementToken()) {
String text = termAtt.toString();
assertFalse(stopWordsSet.contains(text));
@@ -83,6 +84,7 @@ public class TestStopAnalyzer extends Ba
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
+ stream.reset();
while (stream.incrementToken()) {
String text = termAtt.toString();
assertFalse(stopWordsSet.contains(text));