You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 20:58:44 UTC
svn commit: r1534320 [6/39] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/idea/.idea/ dev-tools/idea/lucene/expressions/
dev-tools/idea/solr/contrib/velocity/ dev-tools/maven/
dev-tools/maven/lucene/ dev-tools/maven/lucene/expressions/ ...
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Mon Oct 21 18:58:24 2013
@@ -44,16 +44,16 @@ public class TestMorfologikAnalyzer exte
/** Test stemming of single tokens with Morfologik library. */
public final void testSingleTokens() throws IOException {
Analyzer a = getTestAnalyzer();
- assertAnalyzesToReuse(a, "a", new String[] { "a" });
- assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
- assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
- assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
+ assertAnalyzesTo(a, "a", new String[] { "a" });
+ assertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" });
+ assertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
+ assertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
}
/** Test stemming of multiple tokens and proper term metrics. */
public final void testMultipleTokens() throws IOException {
Analyzer a = getTestAnalyzer();
- assertAnalyzesToReuse(
+ assertAnalyzesTo(
a,
"liście danych",
new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
@@ -61,7 +61,7 @@ public class TestMorfologikAnalyzer exte
new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
- assertAnalyzesToReuse(
+ assertAnalyzesTo(
a,
"T. Gl\u00FCcksberg",
new String[] { "tom", "tona", "Gl\u00FCcksberg" },
@@ -72,50 +72,52 @@ public class TestMorfologikAnalyzer exte
@SuppressWarnings("unused")
private void dumpTokens(String input) throws IOException {
- TokenStream ts = getTestAnalyzer().tokenStream("dummy", input);
- ts.reset();
+ try (TokenStream ts = getTestAnalyzer().tokenStream("dummy", input)) {
+ ts.reset();
- MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
- CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
- while (ts.incrementToken()) {
- System.out.println(charTerm.toString() + " => " + attribute.getTags());
+ MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
+ CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
+ while (ts.incrementToken()) {
+ System.out.println(charTerm.toString() + " => " + attribute.getTags());
+ }
+ ts.end();
}
}
/** Test reuse of MorfologikFilter with leftover stems. */
public final void testLeftoverStems() throws IOException {
Analyzer a = getTestAnalyzer();
- TokenStream ts_1 = a.tokenStream("dummy", "liście");
- CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
- ts_1.reset();
- ts_1.incrementToken();
- assertEquals("first stream", "liście", termAtt_1.toString());
- ts_1.end();
- ts_1.close();
-
- TokenStream ts_2 = a.tokenStream("dummy", "danych");
- CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
- ts_2.reset();
- ts_2.incrementToken();
- assertEquals("second stream", "dany", termAtt_2.toString());
- ts_2.end();
- ts_2.close();
+ try (TokenStream ts_1 = a.tokenStream("dummy", "liście")) {
+ CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
+ ts_1.reset();
+ ts_1.incrementToken();
+ assertEquals("first stream", "liście", termAtt_1.toString());
+ ts_1.end();
+ }
+
+ try (TokenStream ts_2 = a.tokenStream("dummy", "danych")) {
+ CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
+ ts_2.reset();
+ ts_2.incrementToken();
+ assertEquals("second stream", "dany", termAtt_2.toString());
+ ts_2.end();
+ }
}
/** Test stemming of mixed-case tokens. */
public final void testCase() throws IOException {
Analyzer a = getTestAnalyzer();
- assertAnalyzesToReuse(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
- assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
+ assertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
+ assertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
- assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
- assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
+ assertAnalyzesTo(a, "Poznania", new String[] { "Poznań" });
+ assertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" });
- assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
- assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
+ assertAnalyzesTo(a, "Aarona", new String[] { "Aaron" });
+ assertAnalyzesTo(a, "aarona", new String[] { "aarona" });
- assertAnalyzesToReuse(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
+ assertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
}
private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
@@ -140,28 +142,27 @@ public class TestMorfologikAnalyzer exte
/** Test morphosyntactic annotations. */
public final void testPOSAttribute() throws IOException {
- TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście");
-
- ts.reset();
- assertPOSToken(ts, "liście",
+ try (TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście")) {
+ ts.reset();
+ assertPOSToken(ts, "liście",
"subst:sg:acc:n2",
"subst:sg:nom:n2",
"subst:sg:voc:n2");
- assertPOSToken(ts, "liść",
+ assertPOSToken(ts, "liść",
"subst:pl:acc:m3",
"subst:pl:nom:m3",
"subst:pl:voc:m3");
- assertPOSToken(ts, "list",
+ assertPOSToken(ts, "list",
"subst:sg:loc:m3",
"subst:sg:voc:m3");
- assertPOSToken(ts, "lista",
+ assertPOSToken(ts, "lista",
"subst:sg:dat:f",
"subst:sg:loc:f");
- ts.end();
- ts.close();
+ ts.end();
+ }
}
/** */
@@ -183,7 +184,7 @@ public class TestMorfologikAnalyzer exte
}
};
- assertAnalyzesToReuse(
+ assertAnalyzesTo(
a,
"liście danych",
new String[] { "liście", "dany", "dana", "dane", "dać" },
Modified: lucene/dev/branches/lucene4956/lucene/analysis/phonetic/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/phonetic/ivy.xml?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/phonetic/ivy.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/phonetic/ivy.xml Mon Oct 21 18:58:24 2013
@@ -19,7 +19,7 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-phonetic"/>
<dependencies>
- <dependency org="commons-codec" name="commons-codec" rev="1.7" transitive="false"/>
+ <dependency org="commons-codec" name="commons-codec" rev="${/commons-codec/commons-codec}" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java Mon Oct 21 18:58:24 2013
@@ -105,6 +105,6 @@ public class DoubleMetaphoneFilterTest e
return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, 8, random().nextBoolean()));
}
};
- checkOneTermReuse(a, "", "");
+ checkOneTerm(a, "", "");
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java Mon Oct 21 18:58:24 2013
@@ -106,7 +106,7 @@ public class TestBeiderMorseFilter exten
return new TokenStreamComponents(tokenizer, new BeiderMorseFilter(tokenizer, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true)));
}
};
- checkOneTermReuse(a, "", "");
+ checkOneTerm(a, "", "");
}
public void testCustomAttribute() throws IOException {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java Mon Oct 21 18:58:24 2013
@@ -51,27 +51,6 @@ public class TestDoubleMetaphoneFilterFa
assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" });
}
- /**
- * Ensure that reset() removes any state (buffered tokens)
- */
- public void testReset() throws Exception {
- DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(new HashMap<String, String>());
- TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false);
-
- TokenStream filteredStream = factory.create(inputStream);
- CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
- assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
-
- filteredStream.reset();
- assertTrue(filteredStream.incrementToken());
- assertEquals(13, termAtt.length());
- assertEquals("international", termAtt.toString());
- filteredStream.reset();
-
- // ensure there are no more tokens, such as ANTRNXNL
- assertFalse(filteredStream.incrementToken());
- }
-
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java Mon Oct 21 18:58:24 2013
@@ -113,7 +113,7 @@ public class TestPhoneticFilter extends
return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, random().nextBoolean()));
}
};
- checkOneTermReuse(a, "", "");
+ checkOneTerm(a, "", "");
}
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Mon Oct 21 18:58:24 2013
@@ -108,11 +108,13 @@ public final class SentenceTokenizer ext
@Override
public void reset() throws IOException {
+ super.reset();
tokenStart = tokenEnd = 0;
}
@Override
- public void end() {
+ public void end() throws IOException {
+ super.end();
// set final offset
final int finalOffset = correctOffset(tokenEnd);
offsetAtt.setOffset(finalOffset, finalOffset);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Mon Oct 21 18:58:24 2013
@@ -79,7 +79,7 @@ public class TestSmartChineseAnalyzer ex
String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
for (Analyzer analyzer : analyzers) {
assertAnalyzesTo(analyzer, sentence, result);
- assertAnalyzesToReuse(analyzer, sentence, result);
+ assertAnalyzesTo(analyzer, sentence, result);
}
}
@@ -167,11 +167,11 @@ public class TestSmartChineseAnalyzer ex
public void testReusableTokenStream() throws Exception {
Analyzer a = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
- assertAnalyzesToReuse(a, "我购买 Tests 了道具和服装",
+ assertAnalyzesTo(a, "我购买 Tests 了道具和服装",
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
new int[] { 0, 1, 4, 10, 11, 13, 14 },
new int[] { 1, 3, 9, 11, 13, 14, 16 });
- assertAnalyzesToReuse(a, "我购买了道具和服装。",
+ assertAnalyzesTo(a, "我购买了道具和服装。",
new String[] { "我", "购买", "了", "道具", "和", "服装" },
new int[] { 0, 1, 3, 4, 6, 7 },
new int[] { 1, 3, 4, 6, 7, 9 });
@@ -184,9 +184,11 @@ public class TestSmartChineseAnalyzer ex
sb.append("我购买了道具和服装。");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
- TokenStream stream = analyzer.tokenStream("", sb.toString());
- stream.reset();
- while (stream.incrementToken()) {
+ try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
+ stream.reset();
+ while (stream.incrementToken()) {
+ }
+ stream.end();
}
}
@@ -197,9 +199,11 @@ public class TestSmartChineseAnalyzer ex
sb.append("我购买了道具和服装");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
- TokenStream stream = analyzer.tokenStream("", sb.toString());
- stream.reset();
- while (stream.incrementToken()) {
+ try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
+ stream.reset();
+ while (stream.incrementToken()) {
+ }
+ stream.end();
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Mon Oct 21 18:58:24 2013
@@ -34,8 +34,8 @@ public class TestPolishAnalyzer extends
public void testBasics() throws IOException {
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT);
// stemming
- checkOneTermReuse(a, "studenta", "student");
- checkOneTermReuse(a, "studenci", "student");
+ checkOneTerm(a, "studenta", "student");
+ checkOneTerm(a, "studenci", "student");
// stopword
assertAnalyzesTo(a, "był", new String[] {});
}
@@ -45,8 +45,8 @@ public class TestPolishAnalyzer extends
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("studenta"), false);;
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
PolishAnalyzer.getDefaultStopSet(), exclusionSet);
- checkOneTermReuse(a, "studenta", "studenta");
- checkOneTermReuse(a, "studenci", "student");
+ checkOneTerm(a, "studenta", "studenta");
+ checkOneTerm(a, "studenci", "student");
}
/** blast some random strings through the analyzer */
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/ivy.xml?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/ivy.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/ivy.xml Mon Oct 21 18:58:24 2013
@@ -19,9 +19,9 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-uima"/>
<dependencies>
- <dependency org="org.apache.uima" name="Tagger" rev="2.3.1" transitive="false"/>
- <dependency org="org.apache.uima" name="WhitespaceTokenizer" rev="2.3.1" transitive="false"/>
- <dependency org="org.apache.uima" name="uimaj-core" rev="2.3.1" transitive="false"/>
+ <dependency org="org.apache.uima" name="Tagger" rev="${/org.apache.uima/Tagger}" transitive="false"/>
+ <dependency org="org.apache.uima" name="WhitespaceTokenizer" rev="${/org.apache.uima/WhitespaceTokenizer}" transitive="false"/>
+ <dependency org="org.apache.uima" name="uimaj-core" rev="${/org.apache.uima/uimaj-core}" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java Mon Oct 21 18:58:24 2013
@@ -89,11 +89,7 @@ public abstract class BaseUIMATokenizer
@Override
public void reset() throws IOException {
- iterator = null;
- }
-
- @Override
- public void end() throws IOException {
+ super.reset();
iterator = null;
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java Mon Oct 21 18:58:24 2013
@@ -86,7 +86,7 @@ public final class UIMAAnnotationsTokeni
@Override
public void end() throws IOException {
- offsetAttr.setOffset(finalOffset, finalOffset);
super.end();
+ offsetAttr.setOffset(finalOffset, finalOffset);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java Mon Oct 21 18:58:24 2013
@@ -107,8 +107,8 @@ public final class UIMATypeAwareAnnotati
@Override
public void end() throws IOException {
- offsetAttr.setOffset(finalOffset, finalOffset);
super.end();
+ offsetAttr.setOffset(finalOffset, finalOffset);
}
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/ivy.xml?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/ivy.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/ivy.xml Mon Oct 21 18:58:24 2013
@@ -19,9 +19,9 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="benchmark"/>
<dependencies>
- <dependency org="org.apache.commons" name="commons-compress" rev="1.4.1" transitive="false"/>
- <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
- <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.17" transitive="false"/>
+ <dependency org="org.apache.commons" name="commons-compress" rev="${/org.apache.commons/commons-compress}" transitive="false"/>
+ <dependency org="xerces" name="xercesImpl" rev="${/xerces/xercesImpl}" transitive="false"/>
+ <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="${/net.sourceforge.nekohtml/nekohtml}" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Mon Oct 21 18:58:24 2013
@@ -90,6 +90,8 @@ public class ReadTokensTask extends Perf
termAtt.fillBytesRef();
tokenCount++;
}
+ stream.end();
+ stream.close();
}
totalTokenCount += tokenCount;
return tokenCount;
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Mon Oct 21 18:58:24 2013
@@ -342,7 +342,7 @@ public class TestPerfTasksLogic extends
Benchmark benchmark = execBenchmark(algLines);
DirectoryReader r = DirectoryReader.open(benchmark.getRunData().getDirectory());
- SortedDocValues idx = FieldCache.DEFAULT.getTermsIndex(new SlowCompositeReaderWrapper(r), "country");
+ SortedDocValues idx = FieldCache.DEFAULT.getTermsIndex(SlowCompositeReaderWrapper.wrap(r), "country");
final int maxDoc = r.maxDoc();
assertEquals(1000, maxDoc);
for(int i=0;i<1000;i++) {
Modified: lucene/dev/branches/lucene4956/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/build.xml?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/build.xml Mon Oct 21 18:58:24 2013
@@ -169,6 +169,11 @@
</license-check-macro>
</target>
+ <target name="check-lib-versions" depends="compile-tools,resolve,load-custom-tasks"
+ description="Verify that the '/org/name' keys in ivy-versions.properties are sorted lexically and are neither duplicates nor orphans, and that all dependencies in all ivy.xml files use rev="$${/org/name}" format.">
+ <lib-versions-check-macro dir="${common.dir}/.." centralized.versions.file="${common.dir}/ivy-versions.properties"/>
+ </target>
+
<target name="check-forbidden-apis" depends="compile-tools,compile-test,install-forbidden-apis,-forbidden-apis-classpath,-check-forbidden-jdk-apis,-check-forbidden-test-apis,-check-system-out" description="Check forbidden API calls in compiled class files"/>
<!-- TODO: Make the forbidden API checks per module! -->
@@ -283,6 +288,7 @@
<!-- codecs: problems -->
<!-- core: problems -->
<check-missing-javadocs dir="build/docs/demo" level="method"/>
+ <check-missing-javadocs dir="build/docs/expressions" level="method"/>
<!-- facet: problems -->
<!-- grouping: problems -->
<!-- highlighter: problems -->
@@ -597,6 +603,12 @@
<modules-crawl target="jar-core"/>
</target>
+ <target name="jar-src" description="create source jars for all modules">
+ <ant dir="${common.dir}/core" target="jar-src" inheritAll="false" />
+ <ant dir="${common.dir}/test-framework" target="jar-src" inheritAll="false" />
+ <modules-crawl target="jar-src"/>
+ </target>
+
<target name="get-jenkins-line-docs" unless="enwiki.exists">
<sequential>
<!-- TODO: can get .lzma instead (it's ~17% smaller) but there's no builtin ant support...? -->
Modified: lucene/dev/branches/lucene4956/lucene/classification/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/build.xml?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/build.xml Mon Oct 21 18:58:24 2013
@@ -28,7 +28,6 @@
<path refid="base.classpath"/>
<pathelement path="${lucene-core.jar}"/>
<pathelement path="${queries.jar}"/>
- <pathelement path="${project.classpath}"/>
<pathelement location="${build.dir}/classes/java" />
</path>
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Mon Oct 21 18:58:24 2013
@@ -18,6 +18,7 @@ package org.apache.lucene.classification
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.search.Query;
import java.io.IOException;
@@ -47,4 +48,16 @@ public interface Classifier<T> {
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
throws IOException;
+ /**
+ * Train the classifier using the underlying Lucene index
+ * @param atomicReader the reader to use to access the Lucene index
+ * @param textFieldName the name of the field used to compare documents
+ * @param classFieldName the name of the field containing the class assigned to documents
+ * @param analyzer the analyzer used to tokenize / filter the unseen text
+ * @param query the query to filter which documents use for training
+ * @throws IOException If there is a low-level I/O error.
+ */
+ public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+ throws IOException;
+
}
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Mon Oct 21 18:58:24 2013
@@ -19,6 +19,8 @@ package org.apache.lucene.classification
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
@@ -43,6 +45,7 @@ public class KNearestNeighborClassifier
private String classFieldName;
private IndexSearcher indexSearcher;
private int k;
+ private Query query;
/**
* Create a {@link Classifier} using kNN algorithm
@@ -59,9 +62,18 @@ public class KNearestNeighborClassifier
@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
if (mlt == null) {
- throw new IOException("You must first call Classifier#train first");
+ throw new IOException("You must first call Classifier#train");
+ }
+ Query q;
+ if (query != null) {
+ Query mltQuery = mlt.like(new StringReader(text), textFieldName);
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(query, BooleanClause.Occur.MUST);
+ bq.add(mltQuery, BooleanClause.Occur.MUST);
+ q = bq;
+ } else {
+ q = mlt.like(new StringReader(text), textFieldName);
}
- Query q = mlt.like(new StringReader(text), textFieldName);
TopDocs topDocs = indexSearcher.search(q, k);
return selectClassFromNeighbors(topDocs);
}
@@ -71,13 +83,11 @@ public class KNearestNeighborClassifier
Map<BytesRef, Integer> classCounts = new HashMap<BytesRef, Integer>();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
BytesRef cl = new BytesRef(indexSearcher.doc(scoreDoc.doc).getField(classFieldName).stringValue());
- if (cl != null) {
- Integer count = classCounts.get(cl);
- if (count != null) {
- classCounts.put(cl, count + 1);
- } else {
- classCounts.put(cl, 1);
- }
+ Integer count = classCounts.get(cl);
+ if (count != null) {
+ classCounts.put(cl, count + 1);
+ } else {
+ classCounts.put(cl, 1);
}
}
double max = 0;
@@ -98,11 +108,20 @@ public class KNearestNeighborClassifier
*/
@Override
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
+ train(atomicReader, textFieldName, classFieldName, analyzer, null);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query) throws IOException {
this.textFieldName = textFieldName;
this.classFieldName = classFieldName;
mlt = new MoreLikeThis(atomicReader);
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(new String[]{textFieldName});
indexSearcher = new IndexSearcher(atomicReader);
+ this.query = query;
}
}
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Mon Oct 21 18:58:24 2013
@@ -27,6 +27,7 @@ import org.apache.lucene.index.TermsEnum
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.search.WildcardQuery;
@@ -49,6 +50,7 @@ public class SimpleNaiveBayesClassifier
private int docsWithClassSize;
private Analyzer analyzer;
private IndexSearcher indexSearcher;
+ private Query query;
/**
* Creates a new NaiveBayes classifier.
@@ -62,7 +64,7 @@ public class SimpleNaiveBayesClassifier
* {@inheritDoc}
*/
@Override
- public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+ public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
throws IOException {
this.atomicReader = atomicReader;
this.indexSearcher = new IndexSearcher(this.atomicReader);
@@ -70,13 +72,29 @@ public class SimpleNaiveBayesClassifier
this.classFieldName = classFieldName;
this.analyzer = analyzer;
this.docsWithClassSize = countDocsWithClass();
+ this.query = query;
+ }
+
+ @Override
+ public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
+ train(atomicReader, textFieldName, classFieldName, analyzer, null);
}
private int countDocsWithClass() throws IOException {
int docCount = MultiFields.getTerms(this.atomicReader, this.classFieldName).getDocCount();
if (docCount == -1) { // in case codec doesn't support getDocCount
TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
- indexSearcher.search(new WildcardQuery(new Term(classFieldName, String.valueOf(WildcardQuery.WILDCARD_STRING))),
+ Query q;
+ if (query != null) {
+ BooleanQuery bq = new BooleanQuery();
+ WildcardQuery wq = new WildcardQuery(new Term(classFieldName, String.valueOf(WildcardQuery.WILDCARD_STRING)));
+ bq.add(wq, BooleanClause.Occur.MUST);
+ bq.add(query, BooleanClause.Occur.MUST);
+ q = bq;
+ } else {
+ q = new WildcardQuery(new Term(classFieldName, String.valueOf(WildcardQuery.WILDCARD_STRING)));
+ }
+ indexSearcher.search(q,
totalHitCountCollector);
docCount = totalHitCountCollector.getTotalHits();
}
@@ -85,14 +103,14 @@ public class SimpleNaiveBayesClassifier
private String[] tokenizeDoc(String doc) throws IOException {
Collection<String> result = new LinkedList<String>();
- TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
- CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
- tokenStream.reset();
- while (tokenStream.incrementToken()) {
- result.add(charTermAttribute.toString());
+ try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc)) {
+ CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+ tokenStream.reset();
+ while (tokenStream.incrementToken()) {
+ result.add(charTermAttribute.toString());
+ }
+ tokenStream.end();
}
- tokenStream.end();
- tokenStream.close();
return result.toArray(new String[result.size()]);
}
@@ -102,7 +120,7 @@ public class SimpleNaiveBayesClassifier
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
if (atomicReader == null) {
- throw new IOException("You must first call Classifier#train first");
+ throw new IOException("You must first call Classifier#train");
}
double max = 0d;
BytesRef foundClass = new BytesRef();
@@ -157,6 +175,9 @@ public class SimpleNaiveBayesClassifier
BooleanQuery booleanQuery = new BooleanQuery();
booleanQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.MUST));
booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
+ if (query != null) {
+ booleanQuery.add(query, BooleanClause.Occur.MUST);
+ }
TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
indexSearcher.search(booleanQuery, totalHitCountCollector);
return totalHitCountCollector.getTotalHits();
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/package.html?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/package.html Mon Oct 21 18:58:24 2013
@@ -17,7 +17,7 @@
<html>
<body>
Uses already seen data (the indexed documents) to classify new documents.
-Currently only contains a (simplistic) Lucene based Naive Bayes classifier
-and a k-Nearest Neighbor classifier
+Currently only contains a (simplistic) Lucene based Naive Bayes classifier,
+a k-Nearest Neighbor classifier and a Perceptron based classifier
</body>
</html>
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java Mon Oct 21 18:58:24 2013
@@ -21,14 +21,20 @@ import org.apache.lucene.document.Docume
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
import org.junit.After;
import org.junit.Before;
+import java.io.IOException;
+import java.util.Random;
+
/**
* Base class for testing {@link Classifier}s
*/
@@ -40,8 +46,9 @@ public abstract class ClassificationTest
public static final BytesRef TECHNOLOGY_RESULT = new BytesRef("technology");
private RandomIndexWriter indexWriter;
- private String textFieldName;
private Directory dir;
+
+ String textFieldName;
String categoryFieldName;
String booleanFieldName;
@@ -64,83 +71,145 @@ public abstract class ClassificationTest
dir.close();
}
+ protected void checkCorrectClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName) throws Exception {
+ checkCorrectClassification(classifier, inputDoc, expectedResult, analyzer, textFieldName, classFieldName, null);
+ }
- protected void checkCorrectClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String classFieldName) throws Exception {
- SlowCompositeReaderWrapper compositeReaderWrapper = null;
+ protected void checkCorrectClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName, Query query) throws Exception {
+ AtomicReader atomicReader = null;
try {
- populateIndex(analyzer);
- compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
- classifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
+ populateSampleIndex(analyzer);
+ atomicReader = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
+ classifier.train(atomicReader, textFieldName, classFieldName, analyzer, query);
ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
assertNotNull(classificationResult.getAssignedClass());
assertEquals("got an assigned class of " + classificationResult.getAssignedClass(), expectedResult, classificationResult.getAssignedClass());
assertTrue("got a not positive score " + classificationResult.getScore(), classificationResult.getScore() > 0);
} finally {
- if (compositeReaderWrapper != null)
- compositeReaderWrapper.close();
+ if (atomicReader != null)
+ atomicReader.close();
+ }
+ }
+
+ protected void checkPerformance(Classifier<T> classifier, Analyzer analyzer, String classFieldName) throws Exception {
+ AtomicReader atomicReader = null;
+ long trainStart = System.currentTimeMillis();
+ long trainEnd = 0l;
+ try {
+ populatePerformanceIndex(analyzer);
+ atomicReader = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
+ classifier.train(atomicReader, textFieldName, classFieldName, analyzer);
+ trainEnd = System.currentTimeMillis();
+ long trainTime = trainEnd - trainStart;
+ assertTrue("training took more than 2 mins : " + trainTime / 1000 + "s", trainTime < 120000);
+ } finally {
+ if (atomicReader != null)
+ atomicReader.close();
+ }
+ }
+
+ private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
+ indexWriter.deleteAll();
+ indexWriter.commit();
+
+ FieldType ft = new FieldType(TextField.TYPE_STORED);
+ ft.setStoreTermVectors(true);
+ ft.setStoreTermVectorOffsets(true);
+ ft.setStoreTermVectorPositions(true);
+ int docs = 1000;
+ Random random = random();
+ for (int i = 0; i < docs; i++) {
+ boolean b = random.nextBoolean();
+ Document doc = new Document();
+ doc.add(new Field(textFieldName, createRandomString(random), ft));
+ doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft));
+ doc.add(new Field(booleanFieldName, String.valueOf(b), ft));
+ indexWriter.addDocument(doc, analyzer);
+ }
+ indexWriter.commit();
+ }
+
+ private String createRandomString(Random random) {
+ StringBuilder builder = new StringBuilder();
+ for (int i = 0; i < 20; i++) {
+ builder.append(_TestUtil.randomSimpleString(random, 5));
+ builder.append(" ");
}
+ return builder.toString();
}
- private void populateIndex(Analyzer analyzer) throws Exception {
+ private void populateSampleIndex(Analyzer analyzer) throws Exception {
+
+ indexWriter.deleteAll();
+ indexWriter.commit();
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
+ String text;
+
Document doc = new Document();
- doc.add(new Field(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+ text = "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
"who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", ft));
+ "the Unknown Soldier in Warsaw Tuesday.";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
- doc.add(new Field(booleanFieldName, "false", ft));
+ doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new Field(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", ft));
+ text = "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+ " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
- doc.add(new Field(booleanFieldName, "false", ft));
+ doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new Field(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
+ text = "And there's a threshold question that he has to answer for the American people and " +
"that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", ft));
+ "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
- doc.add(new Field(booleanFieldName, "false", ft));
+ doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new Field(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+ text = "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
"keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", ft));
+ "Albany's School of Criminal Justice.";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
- doc.add(new Field(booleanFieldName, "false", ft));
+ doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new Field(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+ text = "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
"technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", ft));
+ "world through the Internet.";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
- doc.add(new Field(booleanFieldName, "true", ft));
+ doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new Field(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", ft));
+ text = "So, about all those experts and analysts who've spent the past year or so saying " +
+ "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
- doc.add(new Field(booleanFieldName, "true", ft));
+ doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new Field(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+ text = "More than 400 million people trust Google with their e-mail, and 50 million store files" +
" in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", ft));
+ "generally transfer or store huge volumes of personal data online.";
+ doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
- doc.add(new Field(booleanFieldName, "true", ft));
+ doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.commit();
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Mon Oct 21 18:58:24 2013
@@ -17,6 +17,8 @@
package org.apache.lucene.classification;
import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;
@@ -27,7 +29,17 @@ public class KNearestNeighborClassifierT
@Test
public void testBasicUsage() throws Exception {
- checkCorrectClassification(new KNearestNeighborClassifier(1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), categoryFieldName);
+ checkCorrectClassification(new KNearestNeighborClassifier(1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
+ }
+
+ @Test
+ public void testBasicUsageWithQuery() throws Exception {
+ checkCorrectClassification(new KNearestNeighborClassifier(1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName, new TermQuery(new Term(textFieldName, "it")));
+ }
+
+ @Test
+ public void testPerformance() throws Exception {
+ checkPerformance(new KNearestNeighborClassifier(100), new MockAnalyzer(random()), categoryFieldName);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Mon Oct 21 18:58:24 2013
@@ -21,11 +21,11 @@ import org.apache.lucene.analysis.MockAn
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
-import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.Version;
import org.junit.Test;
import java.io.Reader;
@@ -39,13 +39,18 @@ public class SimpleNaiveBayesClassifierT
@Test
public void testBasicUsage() throws Exception {
- checkCorrectClassification(new SimpleNaiveBayesClassifier(), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), categoryFieldName);
- checkCorrectClassification(new SimpleNaiveBayesClassifier(), POLITICS_INPUT, POLITICS_RESULT, new MockAnalyzer(random()), categoryFieldName);
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), POLITICS_INPUT, POLITICS_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
+ }
+
+ @Test
+ public void testBasicUsageWithQuery() throws Exception {
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName, new TermQuery(new Term(textFieldName, "it")));
}
@Test
public void testNGramUsage() throws Exception {
- checkCorrectClassification(new SimpleNaiveBayesClassifier(), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new NGramAnalyzer(), categoryFieldName);
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new NGramAnalyzer(), textFieldName, categoryFieldName);
}
private class NGramAnalyzer extends Analyzer {
@@ -56,4 +61,9 @@ public class SimpleNaiveBayesClassifierT
}
}
+ @Test
+ public void testPerformance() throws Exception {
+ checkPerformance(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()), categoryFieldName);
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java Mon Oct 21 18:58:24 2013
@@ -131,9 +131,15 @@ public class DataSplitterTest extends Lu
closeQuietly(testReader);
closeQuietly(cvReader);
} finally {
- trainingIndex.close();
- testIndex.close();
- crossValidationIndex.close();
+ if (trainingIndex != null) {
+ trainingIndex.close();
+ }
+ if (testIndex != null) {
+ testIndex.close();
+ }
+ if (crossValidationIndex != null) {
+ crossValidationIndex.close();
+ }
}
}
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java Mon Oct 21 18:58:24 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.codecs.blockte
import java.io.IOException;
import java.util.Collections;
-import java.util.Comparator;
import java.util.Iterator;
import java.util.TreeMap;
@@ -142,6 +141,7 @@ public class BlockTermsReader extends Fi
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
+ final int longsSize = version >= BlockTermsWriter.VERSION_META_ARRAY ? in.readVInt() : 0;
if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
}
@@ -151,7 +151,7 @@ public class BlockTermsReader extends Fi
if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
}
- FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount));
+ FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount, longsSize));
if (previous != null) {
throw new CorruptIndexException("duplicate fields: " + fieldInfo.name + " (resource=" + in + ")");
}
@@ -230,8 +230,9 @@ public class BlockTermsReader extends Fi
final long sumTotalTermFreq;
final long sumDocFreq;
final int docCount;
+ final int longsSize;
- FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
@@ -239,16 +240,17 @@ public class BlockTermsReader extends Fi
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
+ this.longsSize = longsSize;
}
@Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
+ public TermsEnum iterator(TermsEnum reuse) throws IOException {
+ return new SegmentTermsEnum();
}
@Override
- public TermsEnum iterator(TermsEnum reuse) throws IOException {
- return new SegmentTermsEnum();
+ public boolean hasFreqs() {
+ return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
@Override
@@ -326,6 +328,10 @@ public class BlockTermsReader extends Fi
private final ByteArrayDataInput freqReader = new ByteArrayDataInput();
private int metaDataUpto;
+ private long[] longs;
+ private byte[] bytes;
+ private ByteArrayDataInput bytesReader;
+
public SegmentTermsEnum() throws IOException {
in = BlockTermsReader.this.in.clone();
in.seek(termsStartPointer);
@@ -339,11 +345,7 @@ public class BlockTermsReader extends Fi
termSuffixes = new byte[128];
docFreqBytes = new byte[64];
//System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
+ longs = new long[longsSize];
}
// TODO: we may want an alternate mode here which is
@@ -415,7 +417,7 @@ public class BlockTermsReader extends Fi
assert result;
indexIsCurrent = true;
- didIndexNext = false;
+ didIndexNext = false;
if (doOrd) {
state.ord = indexEnum.ord()-1;
@@ -789,11 +791,20 @@ public class BlockTermsReader extends Fi
//System.out.println(" freq bytes len=" + len);
in.readBytes(docFreqBytes, 0, len);
freqReader.reset(docFreqBytes, 0, len);
- metaDataUpto = 0;
- state.termBlockOrd = 0;
+ // metadata
+ len = in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(len, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < len) {
+ bytes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ in.readBytes(bytes, 0, len);
+ bytesReader.reset(bytes, 0, len);
- postingsReader.readTermsBlock(in, fieldInfo, state);
+ metaDataUpto = 0;
+ state.termBlockOrd = 0;
indexIsCurrent = false;
//System.out.println(" indexIsCurrent=" + indexIsCurrent);
@@ -811,9 +822,7 @@ public class BlockTermsReader extends Fi
// lazily catch up on metadata decode:
final int limit = state.termBlockOrd;
- // We must set/incr state.termCount because
- // postings impl can look at this
- state.termBlockOrd = metaDataUpto;
+ boolean absolute = metaDataUpto == 0;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
//System.out.println(" decode mdUpto=" + metaDataUpto);
@@ -825,16 +834,21 @@ public class BlockTermsReader extends Fi
// TODO: if docFreq were bulk decoded we could
// just skipN here:
+
+ // docFreq, totalTermFreq
state.docFreq = freqReader.readVInt();
//System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
state.totalTermFreq = state.docFreq + freqReader.readVLong();
//System.out.println(" totTF=" + state.totalTermFreq);
}
-
- postingsReader.nextTerm(fieldInfo, state);
+ // metadata
+ for (int i = 0; i < longs.length; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
metaDataUpto++;
- state.termBlockOrd++;
+ absolute = false;
}
} else {
//System.out.println(" skip! seekPending");
@@ -842,4 +856,11 @@ public class BlockTermsReader extends Fi
}
}
}
+
+ @Override
+ public long ramBytesUsed() {
+ long sizeInBytes = (postingsReader!=null) ? postingsReader.ramBytesUsed() : 0;
+ sizeInBytes += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
+ return sizeInBytes;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java Mon Oct 21 18:58:24 2013
@@ -17,26 +17,29 @@ package org.apache.lucene.codecs.blockte
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Comparator;
import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
-import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -52,14 +55,15 @@ import org.apache.lucene.util.RamUsageEs
* @lucene.experimental
*/
-public class BlockTermsWriter extends FieldsConsumer {
+public class BlockTermsWriter extends FieldsConsumer implements Closeable {
final static String CODEC_NAME = "BLOCK_TERMS_DICT";
// Initial format
public static final int VERSION_START = 0;
public static final int VERSION_APPEND_ONLY = 1;
- public static final int VERSION_CURRENT = VERSION_APPEND_ONLY;
+ public static final int VERSION_META_ARRAY = 2;
+ public static final int VERSION_CURRENT = VERSION_META_ARRAY;
/** Extension of terms file */
static final String TERMS_EXTENSION = "tib";
@@ -69,6 +73,7 @@ public class BlockTermsWriter extends Fi
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
+ private final int maxDoc;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
@@ -77,8 +82,9 @@ public class BlockTermsWriter extends Fi
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
+ public final int longsSize;
- public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) {
+ public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.termsStartPointer = termsStartPointer;
@@ -86,6 +92,7 @@ public class BlockTermsWriter extends Fi
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
+ this.longsSize = longsSize;
}
}
@@ -98,6 +105,7 @@ public class BlockTermsWriter extends Fi
throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.termsIndexWriter = termsIndexWriter;
+ maxDoc = state.segmentInfo.getDocCount();
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
try {
@@ -109,7 +117,7 @@ public class BlockTermsWriter extends Fi
//System.out.println("BTW.init seg=" + state.segmentName);
- postingsWriter.start(out); // have consumer write its format/header
+ postingsWriter.init(out); // have consumer write its format/header
success = true;
} finally {
if (!success) {
@@ -123,7 +131,43 @@ public class BlockTermsWriter extends Fi
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
+ public void write(Fields fields) throws IOException {
+
+ boolean success = false;
+ try {
+ for(String field : fields) {
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish();
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(this);
+ } else {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ private TermsWriter addField(FieldInfo field) throws IOException {
//System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name);
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
@@ -131,11 +175,8 @@ public class BlockTermsWriter extends Fi
return new TermsWriter(fieldIndexWriter, field, postingsWriter);
}
- @Override
public void close() throws IOException {
-
try {
-
final long dirStart = out.getFilePointer();
out.writeVInt(fields.size());
@@ -148,6 +189,9 @@ public class BlockTermsWriter extends Fi
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
+ if (VERSION_CURRENT >= VERSION_META_ARRAY) {
+ out.writeVInt(field.longsSize);
+ }
}
writeTrailer(dirStart);
} finally {
@@ -161,18 +205,20 @@ public class BlockTermsWriter extends Fi
private static class TermEntry {
public final BytesRef term = new BytesRef();
- public TermStats stats;
+ public BlockTermState state;
}
- class TermsWriter extends TermsConsumer {
+ class TermsWriter {
private final FieldInfo fieldInfo;
private final PostingsWriterBase postingsWriter;
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ private final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
int docCount;
+ int longsSize;
private TermEntry[] pendingTerms;
@@ -185,35 +231,32 @@ public class BlockTermsWriter extends Fi
{
this.fieldInfo = fieldInfo;
this.fieldIndexWriter = fieldIndexWriter;
+ this.docsSeen = new FixedBitSet(maxDoc);
pendingTerms = new TermEntry[32];
for(int i=0;i<pendingTerms.length;i++) {
pendingTerms[i] = new TermEntry();
}
termsStartPointer = out.getFilePointer();
- postingsWriter.setField(fieldInfo);
this.postingsWriter = postingsWriter;
+ this.longsSize = postingsWriter.setField(fieldInfo);
}
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- //System.out.println("BTW: startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
private final BytesRef lastPrevTerm = new BytesRef();
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ void write(BytesRef text, TermsEnum termsEnum) throws IOException {
- assert stats.docFreq > 0;
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state == null) {
+ // No docs for this term:
+ return;
+ }
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
+
+ assert state.docFreq > 0;
//System.out.println("BTW: finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
+ TermStats stats = new TermStats(state.docFreq, state.totalTermFreq);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
if (isIndexTerm) {
@@ -237,17 +280,14 @@ public class BlockTermsWriter extends Fi
}
final TermEntry te = pendingTerms[pendingCount];
te.term.copyBytes(text);
- te.stats = stats;
+ te.state = state;
pendingCount++;
-
- postingsWriter.finishTerm(stats);
numTerms++;
}
// Finishes all terms in this field
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
+ void finish() throws IOException {
if (pendingCount > 0) {
flushBlock();
}
@@ -262,9 +302,10 @@ public class BlockTermsWriter extends Fi
fields.add(new FieldMetaData(fieldInfo,
numTerms,
termsStartPointer,
- sumTotalTermFreq,
+ fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 ? sumTotalTermFreq : -1,
sumDocFreq,
- docCount));
+ docsSeen.cardinality(),
+ longsSize));
}
}
@@ -285,6 +326,7 @@ public class BlockTermsWriter extends Fi
}
private final RAMOutputStream bytesWriter = new RAMOutputStream();
+ private final RAMOutputStream bufferWriter = new RAMOutputStream();
private void flushBlock() throws IOException {
//System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());
@@ -318,19 +360,34 @@ public class BlockTermsWriter extends Fi
// TODO: cutover to better intblock codec. simple64?
// write prefix, suffix first:
for(int termCount=0;termCount<pendingCount;termCount++) {
- final TermStats stats = pendingTerms[termCount].stats;
- assert stats != null;
- bytesWriter.writeVInt(stats.docFreq);
+ final BlockTermState state = pendingTerms[termCount].state;
+ assert state != null;
+ bytesWriter.writeVInt(state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
- bytesWriter.writeVLong(stats.totalTermFreq-stats.docFreq);
+ bytesWriter.writeVLong(state.totalTermFreq-state.docFreq);
}
}
+ out.writeVInt((int) bytesWriter.getFilePointer());
+ bytesWriter.writeTo(out);
+ bytesWriter.reset();
+ // 4th pass: write the metadata
+ long[] longs = new long[longsSize];
+ boolean absolute = true;
+ for(int termCount=0;termCount<pendingCount;termCount++) {
+ final BlockTermState state = pendingTerms[termCount].state;
+ postingsWriter.encodeTerm(longs, bufferWriter, fieldInfo, state, absolute);
+ for (int i = 0; i < longsSize; i++) {
+ bytesWriter.writeVLong(longs[i]);
+ }
+ bufferWriter.writeTo(bytesWriter);
+ bufferWriter.reset();
+ absolute = false;
+ }
out.writeVInt((int) bytesWriter.getFilePointer());
bytesWriter.writeTo(out);
bytesWriter.reset();
- postingsWriter.flushTermsBlock(pendingCount, pendingCount);
lastPrevTerm.copyBytes(pendingTerms[pendingCount-1].term);
pendingCount = 0;
}
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java Mon Oct 21 18:58:24 2013
@@ -256,6 +256,12 @@ public class FixedGapTermsIndexReader ex
clone.close();
}
}
+
+ /** Returns approximate RAM bytes used */
+ public long ramBytesUsed() {
+ return ((termOffsets!=null)? termOffsets.ramBytesUsed() : 0) +
+ ((termsDictOffsets!=null)? termsDictOffsets.ramBytesUsed() : 0);
+ }
}
@Override
@@ -271,4 +277,15 @@ public class FixedGapTermsIndexReader ex
dirOffset = input.readLong();
input.seek(dirOffset);
}
+
+ @Override
+ public long ramBytesUsed() {
+ long sizeInBytes = ((termBytes!=null) ? termBytes.ramBytesUsed() : 0) +
+ ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
+
+ for(FieldIndexData entry : fields.values()) {
+ sizeInBytes += entry.ramBytesUsed();
+ }
+ return sizeInBytes;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java Mon Oct 21 18:58:24 2013
@@ -70,4 +70,7 @@ public abstract class TermsIndexReaderBa
/** Only implemented if {@link TermsIndexReaderBase#supportsOrd()} returns true. */
public abstract long ord();
}
+
+ /** Returns approximate RAM bytes used */
+ public abstract long ramBytesUsed();
}
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java Mon Oct 21 18:58:24 2013
@@ -169,6 +169,11 @@ public class VariableGapTermsIndexReader
w.close();
*/
}
+
+ /** Returns approximate RAM bytes used */
+ public long ramBytesUsed() {
+ return fst == null ? 0 : fst.sizeInBytes();
+ }
}
@Override
@@ -191,4 +196,13 @@ public class VariableGapTermsIndexReader
}
input.seek(dirOffset);
}
+
+ @Override
+ public long ramBytesUsed() {
+ long sizeInBytes = 0;
+ for(FieldIndexData entry : fields.values()) {
+ sizeInBytes += entry.ramBytesUsed();
+ }
+ return sizeInBytes;
+ }
}