You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2012/10/25 15:10:51 UTC
svn commit: r1402140 [3/17] - in /lucene/dev/branches/LUCENE-2878: ./
dev-tools/ dev-tools/eclipse/ dev-tools/eclipse/dot.settings/
dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/classification/
dev-tools/maven/ dev-tools/maven/lucene/classifica...
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java Thu Oct 25 13:10:25 2012
@@ -140,11 +140,10 @@ public final class WikipediaTokenizer ex
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
- * @param untokenizedTypes
*/
public WikipediaTokenizer(Reader input, int tokenOutput, Set<String> untokenizedTypes) {
super(input);
- this.scanner = new WikipediaTokenizerImpl(input);
+ this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
init(tokenOutput, untokenizedTypes);
}
@@ -154,11 +153,10 @@ public final class WikipediaTokenizer ex
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
- * @param untokenizedTypes
*/
public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set<String> untokenizedTypes) {
super(factory, input);
- this.scanner = new WikipediaTokenizerImpl(input);
+ this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
init(tokenOutput, untokenizedTypes);
}
@@ -168,11 +166,10 @@ public final class WikipediaTokenizer ex
*
* @param input The input
* @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
- * @param untokenizedTypes
*/
public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set<String> untokenizedTypes) {
super(source, input);
- this.scanner = new WikipediaTokenizerImpl(input);
+ this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
init(tokenOutput, untokenizedTypes);
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java Thu Oct 25 13:10:25 2012
@@ -171,8 +171,6 @@ public class TestTeeSinkTokenFilter exte
/**
* Not an explicit test, just useful to print out some info on performance
- *
- * @throws Exception
*/
public void performance() throws Exception {
int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java Thu Oct 25 13:10:25 2012
@@ -40,7 +40,6 @@ import org.apache.lucene.analysis.core.K
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.util.LuceneTestCase.Slow;
public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
@@ -53,6 +52,9 @@ public class TestSynonymMapFilter extend
private OffsetAttribute offsetAtt;
private void add(String input, String output, boolean keepOrig) {
+ if (VERBOSE) {
+ System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
+ }
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
@@ -137,6 +139,56 @@ public class TestSynonymMapFilter extend
assertEquals(expectedUpto, expected.length);
}
+ public void testDontKeepOrig() throws Exception {
+ b = new SynonymMap.Builder(true);
+ add("a b", "foo", false);
+
+ final SynonymMap map = b.build();
+
+ final Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "a b c",
+ new String[] {"foo", "c"},
+ new int[] {0, 4},
+ new int[] {3, 5},
+ null,
+ new int[] {1, 1},
+ new int[] {1, 1},
+ true);
+ checkAnalysisConsistency(random(), analyzer, false, "a b c");
+ }
+
+ public void testDoKeepOrig() throws Exception {
+ b = new SynonymMap.Builder(true);
+ add("a b", "foo", true);
+
+ final SynonymMap map = b.build();
+
+ final Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "a b c",
+ new String[] {"a", "foo", "b", "c"},
+ new int[] {0, 0, 2, 4},
+ new int[] {1, 3, 3, 5},
+ null,
+ new int[] {1, 0, 1, 1},
+ new int[] {1, 2, 1, 1},
+ true);
+ checkAnalysisConsistency(random(), analyzer, false, "a b c");
+ }
+
public void testBasic() throws Exception {
b = new SynonymMap.Builder(true);
add("a", "foo", true);
@@ -284,7 +336,7 @@ public class TestSynonymMapFilter extend
if (synOutputs.length == 1) {
// Add full endOffset
endOffset = (inputIDX*2) + syn.in.length();
- posLen = (1+syn.in.length())/2;
+ posLen = syn.keepOrig ? (1+syn.in.length())/2 : 1;
} else {
// Add endOffset matching input token's
endOffset = (matchIDX*2) + 1;
@@ -540,6 +592,9 @@ public class TestSynonymMapFilter extend
for (int i = 0; i < numIters; i++) {
b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
+ if (VERBOSE) {
+ System.out.println("TEST: iter=" + i + " numEntries=" + numEntries);
+ }
for (int j = 0; j < numEntries; j++) {
add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java Thu Oct 25 13:10:25 2012
@@ -52,9 +52,12 @@ public class TestElision extends BaseTok
private List<String> filter(TokenFilter filter) throws IOException {
List<String> tas = new ArrayList<String>();
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ filter.reset();
while (filter.incrementToken()) {
tas.add(termAtt.toString());
}
+ filter.end();
+ filter.close();
return tas;
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Thu Oct 25 13:10:25 2012
@@ -139,7 +139,7 @@ public final class ICUTokenizer extends
* Refill the buffer, accumulating the offset and setting usableLength to the
* last unambiguous break position
*
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
private void refill() throws IOException {
offset += usableLength;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java Thu Oct 25 13:10:25 2012
@@ -225,7 +225,7 @@ public class JapaneseIterationMarkCharFi
*
* @param c iteration mark character to normalize
* @return normalized iteration mark
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
private char normalizeIterationMark(char c) throws IOException {
@@ -252,7 +252,7 @@ public class JapaneseIterationMarkCharFi
* Finds the number of subsequent next iteration marks
*
* @return number of iteration marks starting at the current buffer position
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
private int nextIterationMarkSpanSize() throws IOException {
int spanSize = 0;
@@ -272,7 +272,7 @@ public class JapaneseIterationMarkCharFi
* @param position buffer position (should not exceed bufferPosition)
* @param spanSize iteration mark span size
* @return source character
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
private char sourceCharacter(int position, int spanSize) throws IOException {
return (char) buffer.get(position - spanSize);
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java Thu Oct 25 13:10:25 2012
@@ -35,6 +35,7 @@ public final class JapaneseReadingFormFi
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
+ private StringBuilder buffer = new StringBuilder();
private boolean useRomaji;
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
@@ -50,10 +51,19 @@ public final class JapaneseReadingFormFi
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String reading = readingAttr.getReading();
- if (reading != null) {
- if (useRomaji) {
- ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+
+ if (useRomaji) {
+ if (reading == null) {
+ // if its an OOV term, just try the term text
+ buffer.setLength(0);
+ ToStringUtil.getRomanization(buffer, termAttr);
+ termAttr.setEmpty().append(buffer);
} else {
+ ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+ }
+ } else {
+ // just replace the term text with the reading, if it exists
+ if (reading != null) {
termAttr.setEmpty().append(reading);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java Thu Oct 25 13:10:25 2012
@@ -227,7 +227,7 @@ public final class JapaneseTokenizer ext
outputCompounds = false;
break;
}
- buffer.reset(input);
+ buffer.reset(null); // best effort NPE consumers that don't call reset()
resetState();
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/Dictionary.java Thu Oct 25 13:10:25 2012
@@ -27,22 +27,19 @@ public interface Dictionary {
/**
* Get left id of specified word
- * @param wordId
* @return left id
*/
public int getLeftId(int wordId);
/**
* Get right id of specified word
- * @param wordId
- * @return left id
+ * @return right id
*/
public int getRightId(int wordId);
/**
* Get word cost of specified word
- * @param wordId
- * @return left id
+ * @return word's cost
*/
public int getWordCost(int wordId);
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java Thu Oct 25 13:10:25 2012
@@ -172,7 +172,6 @@ public final class UserDictionary implem
/**
* Convert Map of index and wordIdAndLength to array of {wordId, index, length}
- * @param input
* @return array of {wordId, index, length}
*/
private int[][] toIndexArray(Map<Integer, int[]> input) {
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CSVUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CSVUtil.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CSVUtil.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CSVUtil.java Thu Oct 25 13:10:25 2012
@@ -37,7 +37,7 @@ public final class CSVUtil {
/**
* Parse CSV line
- * @param line
+ * @param line line containing csv-encoded data
* @return Array of values
*/
public static String[] parse(String line) {
@@ -96,7 +96,6 @@ public final class CSVUtil {
/**
* Quote and escape input value for CSV
- * @param original
*/
public static String quoteEscape(String original) {
String result = original;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java Thu Oct 25 13:10:25 2012
@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
@@ -52,12 +54,40 @@ public class TestJapaneseReadingFormFilt
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
);
}
+
+ public void testKatakanaReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
+ }
+ };
+    assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+      new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+ );
+ }
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
);
}
+
+ public void testRomajiReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
+ }
+ };
+    assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+ );
+ }
public void testRandomData() throws IOException {
Random random = random();
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Thu Oct 25 13:10:25 2012
@@ -62,12 +62,16 @@ public class TestMorfologikAnalyzer exte
ts_1.reset();
ts_1.incrementToken();
assertEquals("first stream", "liście", termAtt_1.toString());
+ ts_1.end();
+ ts_1.close();
TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
ts_2.reset();
ts_2.incrementToken();
assertEquals("second stream", "dany", termAtt_2.toString());
+ ts_2.end();
+ ts_2.close();
}
/** Test stemming of mixed-case tokens. */
@@ -110,6 +114,7 @@ public class TestMorfologikAnalyzer exte
public final void testPOSAttribute() throws IOException {
TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
+ ts.reset();
assertPOSToken(ts, "liÅcie",
"subst:sg:acc:n2",
"subst:sg:nom:n2",
@@ -127,6 +132,8 @@ public class TestMorfologikAnalyzer exte
assertPOSToken(ts, "lista",
"subst:sg:dat:f",
"subst:sg:loc:f");
+ ts.end();
+ ts.close();
}
/** blast some random strings through the analyzer */
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java Thu Oct 25 13:10:25 2012
@@ -61,8 +61,8 @@ public final class BeiderMorseFilter ext
* Calls
* {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, org.apache.commons.codec.language.bm.Languages.LanguageSet)}
*
- * @param input
- * @param engine
+ * @param input TokenStream to filter
+ * @param engine configured PhoneticEngine with BM settings.
*/
public BeiderMorseFilter(TokenStream input, PhoneticEngine engine) {
this(input, engine, null);
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java Thu Oct 25 13:10:25 2012
@@ -41,12 +41,12 @@ import org.apache.lucene.analysis.util.T
* <p>
* This takes one required argument, "encoder", and the rest are optional:
* <dl>
- * <dt>encoder<dd> required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
+ * <dt>encoder</dt><dd> required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
* or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by
- * itself if it already contains a '.' or otherwise as in the same package as these others.
- * <dt>inject<dd> (default=true) add tokens to the stream with the offset=0
- * <dt>maxCodeLength<dd>The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
- * support this then specifying this is an error.
+ * itself if it already contains a '.' or otherwise as in the same package as these others.</dd>
+ * <dt>inject</dt><dd> (default=true) add tokens to the stream with the offset=0</dd>
+ * <dt>maxCodeLength</dt><dd>The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
+ * support this then specifying this is an error.</dd>
* </dl>
*
* <pre class="prettyprint" >
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Thu Oct 25 13:10:25 2012
@@ -139,9 +139,7 @@ class BigramDictionary extends AbstractD
* Load the datafile into this BigramDictionary
*
* @param dctFilePath path to the Bigramdictionary (bigramdict.dct)
- * @throws FileNotFoundException
- * @throws IOException
- * @throws UnsupportedEncodingException
+ * @throws IOException If there is a low-level I/O error
*/
public void loadFromFile(String dctFilePath) throws IOException {
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Thu Oct 25 13:10:25 2012
@@ -133,8 +133,7 @@ class WordDictionary extends AbstractDic
/**
* Load coredict.mem internally from the jar file.
*
- * @throws ClassNotFoundException
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
public void load() throws IOException, ClassNotFoundException {
InputStream input = this.getClass().getResourceAsStream("coredict.mem");
@@ -181,9 +180,7 @@ class WordDictionary extends AbstractDic
*
* @param dctFilePath path to word dictionary (coredict.dct)
* @return number of words read
- * @throws FileNotFoundException
- * @throws IOException
- * @throws UnsupportedEncodingException
+ * @throws IOException If there is a low-level I/O error.
*/
private int loadMainDataFromFile(String dctFilePath) throws IOException {
int i, cnt, length, total = 0;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html Thu Oct 25 13:10:25 2012
@@ -19,11 +19,7 @@
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
-<div>
SmartChineseAnalyzer Hidden Markov Model package.
-</div>
-<div>
@lucene.experimental
-</div>
</body>
</html>
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html Thu Oct 25 13:10:25 2012
@@ -20,12 +20,8 @@
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
-<div>
Analyzer for Simplified Chinese, which indexes words.
-</div>
-<div>
@lucene.experimental
-</div>
<div>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java Thu Oct 25 13:10:25 2012
@@ -54,8 +54,7 @@ public abstract class BaseUIMATokenizer
* <p/>
* {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
*
- * @throws AnalysisEngineProcessException
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
protected void analyzeInput() throws AnalysisEngineProcessException, IOException {
cas.reset();
@@ -66,7 +65,7 @@ public abstract class BaseUIMATokenizer
/**
* initialize the FSIterator which is used to build tokens at each incrementToken() method call
*
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
protected abstract void initializeIterator() throws IOException;
Modified: lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java Thu Oct 25 13:10:25 2012
@@ -27,9 +27,7 @@ import org.apache.uima.resource.Resource
public interface AEProvider {
/**
- *
- * @return AnalysisEngine
- * @throws ResourceInitializationException
+ * Returns the AnalysisEngine
*/
public AnalysisEngine getAE() throws ResourceInitializationException;
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/build.xml?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/build.xml (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/build.xml Thu Oct 25 13:10:25 2012
@@ -220,7 +220,8 @@
</java>
<echo>Benchmark output is in file: ${collation.output.file}</echo>
<echo>Converting to JIRA table format...</echo>
- <exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
+ <exec executable="${perl.exe}" output="${collation.jira.output.file}" failonerror="true">
+ <arg value="-CSD"/>
<arg value="scripts/collation.bm2jira.pl"/>
<arg value="${collation.output.file}"/>
</exec>
@@ -246,7 +247,8 @@
</java>
<echo>Benchmark output is in file: ${shingle.output.file}</echo>
<echo>Converting to JIRA table format...</echo>
- <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
+ <exec executable="${perl.exe}" output="${shingle.jira.output.file}" failonerror="true">
+ <arg value="-CSD"/>
<arg value="scripts/shingle.bm2jira.pl"/>
<arg value="${shingle.output.file}"/>
</exec>
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Thu Oct 25 13:10:25 2012
@@ -36,7 +36,7 @@ public interface HTMLParser {
* @param reader reader of html text to parse.
* @param trecSrc the {@link TrecContentSource} used to parse dates.
* @return Parsed doc data.
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/QueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/QueryMaker.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/QueryMaker.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/QueryMaker.java Thu Oct 25 13:10:25 2012
@@ -37,8 +37,7 @@ public interface QueryMaker {
/** Create the next query */
public Query makeQuery () throws Exception;
- /** Set the properties
- * @throws Exception */
+ /** Set the properties */
public void setConfig (Config config) throws Exception;
/** Reset inputs so that the test run would behave, input wise, as if it just started. */
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Thu Oct 25 13:10:25 2012
@@ -130,8 +130,8 @@ public class TrecContentSource extends C
* @param lineStart line start to look for, must not be null.
* @param collectMatchLine whether to collect the matching line into <code>buffer</code>.
* @param collectAll whether to collect all lines into <code>buffer</code>.
- * @throws IOException
- * @throws NoMoreDataException
+ * @throws IOException If there is a low-level I/O error.
+ * @throws NoMoreDataException If the source is exhausted.
*/
private void read(StringBuilder buf, String lineStart,
boolean collectMatchLine, boolean collectAll) throws IOException, NoMoreDataException {
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Thu Oct 25 13:10:25 2012
@@ -24,7 +24,7 @@ import java.util.Locale;
import java.util.Map;
/**
- * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
+ * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
* which are handled in TrecContentSource. Required to be stateless and hence thread safe.
*/
public abstract class TrecDocParser {
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html Thu Oct 25 13:10:25 2012
@@ -20,8 +20,8 @@
<TITLE>Benchmarking Lucene By Tasks</TITLE>
</HEAD>
<BODY>
-<DIV>
Benchmarking Lucene By Tasks.
+<DIV>
<p>
This package provides "task based" performance benchmarking of Lucene.
One can use the predefined benchmarks, or create new ones.
@@ -251,7 +251,7 @@ The following is an informal description
fixed, so for deletion in loops it is better to use the
<code>doc.delete.step</code> property.
</li>
- <li><b>SetProp</b> takes a <code>name,value<code> mandatory param,
+ <li><b>SetProp</b> takes a <code>name,value</code> mandatory param,
',' used as a separator.
</li>
<li><b>SearchTravRetTask</b> and <b>SearchTravTask</b> take a numeric
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java Thu Oct 25 13:10:25 2012
@@ -32,11 +32,6 @@ import org.apache.lucene.benchmark.byTas
*/
public class Sample {
- /**
- * @param args
- * @throws Exception
- * @throws IOException
- */
public static void main(String[] args) throws Exception {
Properties p = initProps();
Config conf = new Config(p);
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Thu Oct 25 13:10:25 2012
@@ -109,7 +109,6 @@ public class TaskSequence extends PerfTa
/**
* @param repetitions The repetitions to set.
- * @throws Exception
*/
public void setRepetitions(int repetitions) throws Exception {
fixedTime = false;
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java Thu Oct 25 13:10:25 2012
@@ -295,7 +295,6 @@ public class Algorithm {
/**
* Execute this algorithm
- * @throws Exception
*/
public void execute() throws Exception {
try {
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java Thu Oct 25 13:10:25 2012
@@ -58,7 +58,7 @@ public class Config {
* Read both algorithm and config properties.
*
* @param algReader from where to read algorithm and config properties.
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
public Config(Reader algReader) throws IOException {
// read alg file to array of lines
@@ -163,7 +163,6 @@ public class Config {
*
* @param name name of property.
* @param value either single or multiple property value (multiple values are separated by ":")
- * @throws Exception
*/
public void set(String name, String value) throws Exception {
if (valByRound.get(name) != null) {
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/FileUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/FileUtils.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/FileUtils.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/FileUtils.java Thu Oct 25 13:10:25 2012
@@ -30,7 +30,7 @@ public class FileUtils {
*
* @param dir file or directory
* @return true on success, false if no or part of files have been deleted
- * @throws java.io.IOException
+ * @throws IOException If there is a low-level I/O error.
*/
public static boolean fullyDelete(File dir) throws IOException {
if (dir == null || !dir.exists()) return false;
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/package.html?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/package.html (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/package.html Thu Oct 25 13:10:25 2012
@@ -20,9 +20,10 @@
<TITLE>Lucene Benchmarking Package</TITLE>
</HEAD>
<BODY>
+The benchmark contribution contains tools for benchmarking Lucene using standard, freely available corpora.
<DIV>
- <p/>
- The benchmark contribution contains tools for benchmarking Lucene using standard, freely available corpora. ANT will
+<p/>
+ ANT will
download the corpus automatically, place it in a temp directory and then unpack it to the working.dir directory specified in the build.
The temp directory
and working directory can be safely removed after a run. However, the next time the task is run, it will need to download the files again.
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java Thu Oct 25 13:10:25 2012
@@ -46,7 +46,7 @@ public class TrecJudge implements Judge
* 19 0 doc7295 0
* </pre>
* @param reader where judgments are read from.
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
public TrecJudge (BufferedReader reader) throws IOException {
judgements = new HashMap<String,QRelJudgement>();
Modified: lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java Thu Oct 25 13:10:25 2012
@@ -74,8 +74,6 @@ public class ExtractReuters {
/**
* Override if you wish to change what is extracted
- *
- * @param sgmFile
*/
protected void extractFile(File sgmFile) {
try {
Modified: lucene/dev/branches/LUCENE-2878/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/build.xml?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/build.xml (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/build.xml Thu Oct 25 13:10:25 2012
@@ -33,11 +33,12 @@
<patternset id="binary.root.dist.patterns"
includes="LICENSE.txt,NOTICE.txt,README.txt,
MIGRATE.txt,JRE_VERSION_MIGRATION.txt,
+ SYSTEM_REQUIREMENTS.txt,
CHANGES.txt,
**/lib/*.jar,
licenses/**,
*/docs/,**/README*"
- excludes="build/**,site/**,tools/**"
+ excludes="build/**,site/**,tools/**,**/lib/*servlet-api*.jar"
/>
@@ -51,7 +52,8 @@
</ant>
</target>
- <target name="test" depends="test-core, test-modules, test-backwards"
+ <!-- "-clover.load" is *not* a useless dependency. do not remove -->
+ <target name="test" depends="-clover.load, test-core, test-modules, test-backwards"
description="Runs all unit tests (core, modules and back-compat)"
/>
@@ -224,51 +226,71 @@
<target name="javadoc" depends="javadocs"/>
<target name="javadocs" description="Generate javadoc" depends="javadocs-lucene-core, javadocs-modules, javadocs-test-framework"/>
+ <target name="documentation-lint" depends="-ecj-javadoc-lint,-documentation-lint,-documentation-lint-unsupported"
+ description="Validates the generated documentation (HTML errors, broken links,...)"/>
+
<!-- we check for broken links across all documentation -->
- <target name="documentation-lint" depends="documentation">
- <sequential>
- <check-broken-links dir="build/docs"/>
- <!-- TODO: change this level=method -->
- <check-missing-javadocs dir="build/docs" level="class"/>
- <!-- too many classes to fix overall to just enable
- the above to be level="method" right now, but we
- can prevent the modules that don't have problems
- from getting any worse -->
- <!-- analyzers-common: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-icu" level="method"/>
- <!-- analyzers-kuromoji: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-morfologik" level="method"/>
- <check-missing-javadocs dir="build/docs/analyzers-phonetic" level="method"/>
- <!-- analyzers-smartcn: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-stempel" level="method"/>
- <!-- analyzers-uima: problems -->
- <!-- benchmark: problems -->
- <check-missing-javadocs dir="build/docs/classification" level="method"/>
- <!-- codecs: problems -->
- <!-- core: problems -->
- <check-missing-javadocs dir="build/docs/demo" level="method"/>
- <!-- facet: problems -->
- <!-- grouping: problems -->
- <!-- highlighter: problems -->
- <check-missing-javadocs dir="build/docs/join" level="method"/>
- <check-missing-javadocs dir="build/docs/memory" level="method"/>
- <!-- misc: problems -->
- <!-- queries: problems -->
- <!-- queryparser: problems -->
- <!-- sandbox: problems -->
- <!-- spatial: problems -->
- <check-missing-javadocs dir="build/docs/suggest" level="method"/>
- <!-- test-framework: problems -->
-
- <!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
- </sequential>
+ <target name="-documentation-lint" if="documentation-lint.supported" depends="documentation">
+ <echo message="checking for broken html..."/>
+ <jtidy-macro>
+ <!-- NOTE: must currently exclude deprecated-list due to a javadocs bug (as of 1.7.0_09)
+ javadocs generates invalid XML if you deprecate a method that takes a parameter
+ with a generic type -->
+ <fileset dir="build/docs" includes="**/*.html" excludes="**/deprecated-list.html"/>
+ </jtidy-macro>
+ <echo message="Checking for broken links..."/>
+ <check-broken-links dir="build/docs"/>
+ <echo message="Checking for missing docs..."/>
+ <!-- TODO: change this level=method -->
+ <check-missing-javadocs dir="build/docs" level="class"/>
+ <!-- too many classes to fix overall to just enable
+ the above to be level="method" right now, but we
+ can prevent the modules that don't have problems
+ from getting any worse -->
+ <!-- analyzers-common: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-icu" level="method"/>
+ <!-- analyzers-kuromoji: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-morfologik" level="method"/>
+ <check-missing-javadocs dir="build/docs/analyzers-phonetic" level="method"/>
+ <!-- analyzers-smartcn: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-stempel" level="method"/>
+ <!-- analyzers-uima: problems -->
+ <!-- benchmark: problems -->
+ <check-missing-javadocs dir="build/docs/classification" level="method"/>
+ <!-- codecs: problems -->
+ <!-- core: problems -->
+ <check-missing-javadocs dir="build/docs/demo" level="method"/>
+ <!-- facet: problems -->
+ <!-- grouping: problems -->
+ <!-- highlighter: problems -->
+ <check-missing-javadocs dir="build/docs/join" level="method"/>
+ <check-missing-javadocs dir="build/docs/memory" level="method"/>
+ <!-- misc: problems -->
+ <!-- queries: problems -->
+ <!-- queryparser: problems -->
+ <!-- sandbox: problems -->
+ <!-- spatial: problems -->
+ <check-missing-javadocs dir="build/docs/suggest" level="method"/>
+ <!-- test-framework: problems -->
+
+ <!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
</target>
- <target name="process-webpages" depends="resolve-pegdown">
+ <target name="-ecj-javadoc-lint" depends="documentation,compile-test-framework,-ecj-resolve">
+ <subant target="-ecj-javadoc-lint" failonerror="true" inheritall="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ <fileset dir="core" includes="build.xml"/>
+ <fileset dir="test-framework" includes="build.xml"/>
+ </subant>
+ <modules-crawl target="-ecj-javadoc-lint"/>
+ </target>
+
+ <target name="process-webpages" depends="resolve-groovy,resolve-pegdown">
<makeurl property="process-webpages.buildfiles" separator="|">
<fileset dir="." includes="**/build.xml" excludes="build.xml,analysis/*,build/**,tools/**,backwards/**,site/**"/>
</makeurl>
@@ -287,7 +309,7 @@
</xslt>
<pegdown todir="${javadoc.dir}">
- <fileset dir="." includes="MIGRATE.txt,JRE_VERSION_MIGRATION.txt"/>
+ <fileset dir="." includes="MIGRATE.txt,JRE_VERSION_MIGRATION.txt,SYSTEM_REQUIREMENTS.txt"/>
<globmapper from="*.txt" to="*.html"/>
</pegdown>
@@ -406,8 +428,9 @@
<!-- Exclude clover license files incompatible with the ASL -->
<delete dir="${svn.export.dir}/tools/clover"/>
- <build-changes changes.src.dir="${svn.export.dir}/site/changes"
- changes.target.dir="${svn.export.dir}/docs/changes"/>
+ <build-changes changes.src.file="${svn.export.dir}/CHANGES.txt"
+ changes.target.dir="${svn.export.dir}/docs/changes"
+ changes.product="LUCENE"/>
<tar tarfile="${source.package.file}" compression="gzip" longfile="gnu">
<tarfileset prefix="lucene-${version}" dir="${svn.export.dir}"/>
</tar>
@@ -498,7 +521,7 @@
</target>
<target name="changes-to-html">
- <build-changes changes.src.dir="${changes.src.dir}" changes.target.dir="${changes.target.dir}" />
+ <build-changes changes.product="LUCENE"/>
</target>
<target name="pitest-modules" depends="compile-test">
@@ -533,16 +556,6 @@
<property name="lucene-core.uptodate" value="true"/>
</target>
- <!-- TODO: in the future, we don't need to actually put
- jars in the lib/ folders, but can just put in classpath.
- only packaging tasks really need that (and could do it
- under build/ directories) -->
- <target name="clean-jars" description="Clean local jars">
- <delete>
- <fileset dir="." includes="**/*.jar"/>
- </delete>
- </target>
-
<target name="get-jenkins-line-docs" unless="enwiki.exists">
<sequential>
<!-- TODO: can get .lzma instead (it's ~17% smaller) but there's no builtin ant support...? -->
@@ -552,7 +565,7 @@
</sequential>
</target>
- <target name="jar-checksums" depends="clean-jars,resolve">
+ <target name="jar-checksums" depends="resolve">
<jar-checksum-macro srcdir="${common.dir}" dstdir="${common.dir}/licenses"/>
</target>
Modified: lucene/dev/branches/LUCENE-2878/lucene/classification/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/classification/build.xml?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/classification/build.xml (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/classification/build.xml Thu Oct 25 13:10:25 2012
@@ -23,4 +23,30 @@
</description>
<import file="../module-build.xml"/>
+
+ <path id="base.classpath">
+ <pathelement location="${common.dir}/build/core/classes/java"/>
+ <pathelement path="${queries.jar}"/>
+ <pathelement path="${project.classpath}"/>
+ </path>
+
+ <path id="test.classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <pathelement location="${common.dir}/build/test-framework/classes/java"/>
+ <pathelement location="${common.dir}/build/codecs/classes/java"/>
+ <path refid="classpath"/>
+ <path refid="junit-path"/>
+ <pathelement location="${build.dir}/classes/java"/>
+ </path>
+
+ <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
+ <target name="javadocs" depends="javadocs-queries,compile-core">
+ <invoke-module-javadoc>
+ <links>
+ <link href="../queries"/>
+ </links>
+ </invoke-module-javadoc>
+ </target>
+
</project>
Modified: lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Thu Oct 25 13:10:25 2012
@@ -29,12 +29,12 @@ import java.io.IOException;
public interface Classifier {
/**
- * Assign a class to the given text String
+ * Assign a class (with score) to the given text String
* @param text a String containing text to be classified
- * @return a String representing a class
- * @throws IOException
+ * @return a {@link ClassificationResult} holding assigned class and score
+ * @throws IOException If there is a low-level I/O error.
*/
- public String assignClass(String text) throws IOException;
+ public ClassificationResult assignClass(String text) throws IOException;
/**
* Train the classifier using the underlying Lucene index
@@ -42,7 +42,7 @@ public interface Classifier {
* @param textFieldName the name of the field used to compare documents
* @param classFieldName the name of the field containing the class assigned to documents
* @param analyzer the analyzer used to tokenize / filter the unseen text
- * @throws IOException
+ * @throws IOException If there is a low-level I/O error.
*/
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
throws IOException;
Modified: lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Thu Oct 25 13:10:25 2012
@@ -80,7 +80,7 @@ public class SimpleNaiveBayesClassifier
return result.toArray(new String[result.size()]);
}
- public String assignClass(String inputDocument) throws IOException {
+ public ClassificationResult assignClass(String inputDocument) throws IOException {
if (atomicReader == null) {
throw new RuntimeException("need to train the classifier first");
}
@@ -98,7 +98,7 @@ public class SimpleNaiveBayesClassifier
foundClass = next.utf8ToString();
}
}
- return foundClass;
+ return new ClassificationResult(foundClass, max);
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/package.html?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/classification/src/java/org/apache/lucene/classification/package.html Thu Oct 25 13:10:25 2012
@@ -18,6 +18,6 @@
<body>
Uses already seen data (the indexed documents) to classify new documents.
Currently only contains a (simplistic) Lucene based Naive Bayes classifier
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier
</body>
</html>
Modified: lucene/dev/branches/LUCENE-2878/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Thu Oct 25 13:10:25 2012
@@ -19,112 +19,32 @@ package org.apache.lucene.classification
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.junit.Test;
+import java.io.Reader;
+
/**
* Testcase for {@link SimpleNaiveBayesClassifier}
*/
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
- private RandomIndexWriter indexWriter;
- private String textFieldName;
- private String classFieldName;
- private Analyzer analyzer;
- private Directory dir;
-
- @Before
- public void setUp() throws Exception {
- super.setUp();
- analyzer = new MockAnalyzer(random());
- dir = newDirectory();
- indexWriter = new RandomIndexWriter(random(), dir);
- textFieldName = "text";
- classFieldName = "cat";
- }
-
- @After
- public void tearDown() throws Exception {
- super.tearDown();
- indexWriter.close();
- dir.close();
- }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
@Test
public void testBasicUsage() throws Exception {
- SlowCompositeReaderWrapper compositeReaderWrapper = null;
- try {
- populateIndex();
- SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
- compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
- simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
- String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
- assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
- } finally {
- if (compositeReaderWrapper != null)
- compositeReaderWrapper.close();
- }
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
}
- private void populateIndex() throws Exception {
-
- Document doc = new Document();
- doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
- "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
- "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
- "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
- "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
- " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
+ @Test
+ public void testNGramUsage() throws Exception {
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+ }
- indexWriter.commit();
+ private class NGramAnalyzer extends Analyzer {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+ 10, 20));
+ }
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java Thu Oct 25 13:10:25 2012
@@ -76,7 +76,9 @@ public class BlockTermsReader extends Fi
private TermsIndexReaderBase indexReader;
// keeps the dirStart offset
- protected long dirOffset;
+ private long dirOffset;
+
+ private final int version;
// Used as key for the terms cache
private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
@@ -123,7 +125,7 @@ public class BlockTermsReader extends Fi
boolean success = false;
try {
- readHeader(in);
+ version = readHeader(in);
// Have PostingsReader init itself
postingsReader.init(in);
@@ -168,15 +170,21 @@ public class BlockTermsReader extends Fi
this.indexReader = indexReader;
}
- protected void readHeader(IndexInput input) throws IOException {
- CodecUtil.checkHeader(input, BlockTermsWriter.CODEC_NAME,
+ private int readHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, BlockTermsWriter.CODEC_NAME,
BlockTermsWriter.VERSION_START,
BlockTermsWriter.VERSION_CURRENT);
- dirOffset = input.readLong();
+ if (version < BlockTermsWriter.VERSION_APPEND_ONLY) {
+ dirOffset = input.readLong();
+ }
+ return version;
}
- protected void seekDir(IndexInput input, long dirOffset)
- throws IOException {
+ private void seekDir(IndexInput input, long dirOffset) throws IOException {
+ if (version >= BlockTermsWriter.VERSION_APPEND_ONLY) {
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
+ }
input.seek(dirOffset);
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java Thu Oct 25 13:10:25 2012
@@ -58,8 +58,8 @@ public class BlockTermsWriter extends Fi
// Initial format
public static final int VERSION_START = 0;
-
- public static final int VERSION_CURRENT = VERSION_START;
+ public static final int VERSION_APPEND_ONLY = 1;
+ public static final int VERSION_CURRENT = VERSION_APPEND_ONLY;
/** Extension of terms file */
static final String TERMS_EXTENSION = "tib";
@@ -69,7 +69,27 @@ public class BlockTermsWriter extends Fi
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
- private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
+
+ private static class FieldMetaData {
+ public final FieldInfo fieldInfo;
+ public final long numTerms;
+ public final long termsStartPointer;
+ public final long sumTotalTermFreq;
+ public final long sumDocFreq;
+ public final int docCount;
+
+ public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ this.termsStartPointer = termsStartPointer;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ }
+ }
+
+ private final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
// private final String segment;
@@ -98,10 +118,8 @@ public class BlockTermsWriter extends Fi
}
}
- protected void writeHeader(IndexOutput out) throws IOException {
- CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
-
- out.writeLong(0); // leave space for end index pointer
+ private void writeHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
}
@Override
@@ -110,9 +128,7 @@ public class BlockTermsWriter extends Fi
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field, out.getFilePointer());
- final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
- fields.add(terms);
- return terms;
+ return new TermsWriter(fieldIndexWriter, field, postingsWriter);
}
@Override
@@ -120,27 +136,18 @@ public class BlockTermsWriter extends Fi
try {
- int nonZeroCount = 0;
- for(TermsWriter field : fields) {
- if (field.numTerms > 0) {
- nonZeroCount++;
- }
- }
-
final long dirStart = out.getFilePointer();
- out.writeVInt(nonZeroCount);
- for(TermsWriter field : fields) {
- if (field.numTerms > 0) {
- out.writeVInt(field.fieldInfo.number);
- out.writeVLong(field.numTerms);
- out.writeVLong(field.termsStartPointer);
- if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
- out.writeVLong(field.sumTotalTermFreq);
- }
- out.writeVLong(field.sumDocFreq);
- out.writeVInt(field.docCount);
+ out.writeVInt(fields.size());
+ for(FieldMetaData field : fields) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.numTerms);
+ out.writeVLong(field.termsStartPointer);
+ if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ out.writeVLong(field.sumTotalTermFreq);
}
+ out.writeVLong(field.sumDocFreq);
+ out.writeVInt(field.docCount);
}
writeTrailer(dirStart);
} finally {
@@ -148,8 +155,7 @@ public class BlockTermsWriter extends Fi
}
}
- protected void writeTrailer(long dirStart) throws IOException {
- out.seek(CodecUtil.headerLength(CODEC_NAME));
+ private void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
@@ -252,6 +258,14 @@ public class BlockTermsWriter extends Fi
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
fieldIndexWriter.finish(out.getFilePointer());
+ if (numTerms > 0) {
+ fields.add(new FieldMetaData(fieldInfo,
+ numTerms,
+ termsStartPointer,
+ sumTotalTermFreq,
+ sumDocFreq,
+ docCount));
+ }
}
private int sharedPrefix(BytesRef term1, BytesRef term2) {
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java Thu Oct 25 13:10:25 2012
@@ -70,7 +70,9 @@ public class FixedGapTermsIndexReader ex
final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<FieldInfo,FieldIndexData>();
// start of the field info data
- protected long dirOffset;
+ private long dirOffset;
+
+ private final int version;
public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
throws IOException {
@@ -85,7 +87,7 @@ public class FixedGapTermsIndexReader ex
try {
- readHeader(in);
+ version = readHeader(in);
indexInterval = in.readInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
@@ -148,10 +150,13 @@ public class FixedGapTermsIndexReader ex
return indexDivisor;
}
- protected void readHeader(IndexInput input) throws IOException {
- CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
- FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_START);
- dirOffset = input.readLong();
+ private int readHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
+ FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_CURRENT);
+ if (version < FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
+ dirOffset = input.readLong();
+ }
+ return version;
}
private class IndexEnum extends FieldIndexEnum {
@@ -409,7 +414,11 @@ public class FixedGapTermsIndexReader ex
}
}
- protected void seekDir(IndexInput input, long dirOffset) throws IOException {
+ private void seekDir(IndexInput input, long dirOffset) throws IOException {
+ if (version >= FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
+ }
input.seek(dirOffset);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java Thu Oct 25 13:10:25 2012
@@ -49,7 +49,8 @@ public class FixedGapTermsIndexWriter ex
final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
final static int VERSION_START = 0;
- final static int VERSION_CURRENT = VERSION_START;
+ final static int VERSION_APPEND_ONLY = 1;
+ final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
final private int termIndexInterval;
@@ -74,10 +75,8 @@ public class FixedGapTermsIndexWriter ex
}
}
- protected void writeHeader(IndexOutput out) throws IOException {
+ private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
- // Placeholder for dir offset
- out.writeLong(0);
}
@Override
@@ -250,8 +249,7 @@ public class FixedGapTermsIndexWriter ex
}
}
- protected void writeTrailer(long dirStart) throws IOException {
- out.seek(CodecUtil.headerLength(CODEC_NAME));
+ private void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java Thu Oct 25 13:10:25 2012
@@ -54,7 +54,9 @@ public class VariableGapTermsIndexReader
final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<FieldInfo,FieldIndexData>();
// start of the field info data
- protected long dirOffset;
+ private long dirOffset;
+
+ private final int version;
final String segment;
public VariableGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, String segmentSuffix, IOContext context)
@@ -66,7 +68,7 @@ public class VariableGapTermsIndexReader
try {
- readHeader(in);
+ version = readHeader(in);
this.indexDivisor = indexDivisor;
seekDir(in, dirOffset);
@@ -103,10 +105,13 @@ public class VariableGapTermsIndexReader
return indexDivisor;
}
- protected void readHeader(IndexInput input) throws IOException {
- CodecUtil.checkHeader(input, VariableGapTermsIndexWriter.CODEC_NAME,
- VariableGapTermsIndexWriter.VERSION_START, VariableGapTermsIndexWriter.VERSION_START);
- dirOffset = input.readLong();
+ private int readHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, VariableGapTermsIndexWriter.CODEC_NAME,
+ VariableGapTermsIndexWriter.VERSION_START, VariableGapTermsIndexWriter.VERSION_CURRENT);
+ if (version < VariableGapTermsIndexWriter.VERSION_APPEND_ONLY) {
+ dirOffset = input.readLong();
+ }
+ return version;
}
private static class IndexEnum extends FieldIndexEnum {
@@ -229,7 +234,11 @@ public class VariableGapTermsIndexReader
}
}
- protected void seekDir(IndexInput input, long dirOffset) throws IOException {
+ private void seekDir(IndexInput input, long dirOffset) throws IOException {
+ if (version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY) {
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
+ }
input.seek(dirOffset);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java Thu Oct 25 13:10:25 2012
@@ -52,7 +52,8 @@ public class VariableGapTermsIndexWriter
final static String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
final static int VERSION_START = 0;
- final static int VERSION_CURRENT = VERSION_START;
+ final static int VERSION_APPEND_ONLY = 1;
+ final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
private final List<FSTFieldWriter> fields = new ArrayList<FSTFieldWriter>();
@@ -189,10 +190,8 @@ public class VariableGapTermsIndexWriter
}
}
- protected void writeHeader(IndexOutput out) throws IOException {
+ private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
- // Placeholder for dir offset
- out.writeLong(0);
}
@Override
@@ -316,8 +315,7 @@ public class VariableGapTermsIndexWriter
}
}
- protected void writeTrailer(long dirStart) throws IOException {
- out.seek(CodecUtil.headerLength(CODEC_NAME));
+ private void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
}
Modified: lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java?rev=1402140&r1=1402139&r2=1402140&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (original)
+++ lucene/dev/branches/LUCENE-2878/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java Thu Oct 25 13:10:25 2012
@@ -159,6 +159,7 @@ public final class BloomFilteringPosting
String bloomFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
IndexInput bloomIn = null;
+ boolean success = false;
try {
bloomIn = state.dir.openInput(bloomFileName, state.context);
CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
@@ -178,10 +179,13 @@ public final class BloomFilteringPosting
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
- } finally {
IOUtils.close(bloomIn);
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(bloomIn, delegateFieldsProducer);
+ }
}
-
}
public Iterator<String> iterator() {