Posted to commits@lucene.apache.org by cm...@apache.org on 2013/08/11 14:19:39 UTC
svn commit: r1512909 [5/38] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/suggest/ dev-tools/idea/solr/contrib/dataimporthandler/
dev-tools/idea/solr/core/src/test/ dev-tool...
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.Reader;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -54,7 +53,7 @@ public class TestExtendedMode extends Ba
int numIterations = atLeast(1000);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomUnicodeString(random(), 100);
- TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+ TokenStream ts = analyzer.tokenStream("foo", s);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
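For context: the test hunks in this commit replace tokenStream(field, new StringReader(s)) with the Analyzer.tokenStream(String, String) convenience overload, which wraps the string in a reusable reader internally. A minimal sketch of the consume pattern these tests follow; the analyzer choice, field name, and input text below are illustrative, not part of the commit:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamStringDemo {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_44);
        // String overload; no explicit StringReader needed anymore.
        TokenStream ts = analyzer.tokenStream("field", "some input text");
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // required before incrementToken()
        while (ts.incrementToken()) {
          System.out.println(termAtt.toString());
        }
        ts.end();                        // finalizes offset state
        ts.close();
      }
    }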
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.ja;
*/
import java.io.IOException;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -151,7 +150,7 @@ public class TestJapaneseAnalyzer extend
Mode.SEARCH,
JapaneseAnalyzer.getDefaultStopSet(),
JapaneseAnalyzer.getDefaultStopTags());
- assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
+ assertTokenStreamContents(a.tokenStream("foo", "abcd"),
new String[] { "a", "b", "cd" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 4 },
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java Sun Aug 11 12:19:13 2013
@@ -22,7 +22,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -142,7 +141,7 @@ public class TestJapaneseTokenizer exten
* ideally the test would actually fail instead of hanging...
*/
public void testDecomposition5() throws Exception {
- TokenStream ts = analyzer.tokenStream("bogus", new StringReader("ãããããããããããããããããããããããããããããããããããããããã"));
+ TokenStream ts = analyzer.tokenStream("bogus", "ãããããããããããããããããããããããããããããããããããããããã");
ts.reset();
while (ts.incrementToken()) {
@@ -166,8 +165,8 @@ public class TestJapaneseTokenizer exten
/** Tests that sentence offset is incorporated into the resulting offsets */
public void testTwoSentences() throws Exception {
/*
- //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
- TokenStream ts = analyzer.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+ //TokenStream ts = a.tokenStream("foo", "妹の咲子です。俺と年子で、今受験生です。");
+ TokenStream ts = analyzer.tokenStream("foo", "�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;");
ts.reset();
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while(ts.incrementToken()) {
@@ -214,7 +213,7 @@ public class TestJapaneseTokenizer exten
public void testLargeDocReliability() throws Exception {
for (int i = 0; i < 100; i++) {
String s = _TestUtil.randomUnicodeString(random(), 10000);
- TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+ TokenStream ts = analyzer.tokenStream("foo", s);
ts.reset();
while (ts.incrementToken()) {
}
@@ -235,7 +234,7 @@ public class TestJapaneseTokenizer exten
System.out.println("\nTEST: iter=" + i);
}
String s = _TestUtil.randomUnicodeString(random(), 100);
- TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+ TokenStream ts = analyzer.tokenStream("foo", s);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
@@ -245,14 +244,14 @@ public class TestJapaneseTokenizer exten
}
public void testOnlyPunctuation() throws IOException {
- TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+ TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。");
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
}
public void testOnlyPunctuationExtended() throws IOException {
- TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+ TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......");
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
@@ -261,14 +260,14 @@ public class TestJapaneseTokenizer exten
// note: test is kinda silly since kuromoji emits punctuation tokens.
// but, when/if we filter these out it will be useful.
public void testEnd() throws Exception {
- assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
+ assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", "これは本ではない"),
new String[] { "これ", "は", "本", "で", "は", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 },
new Integer(8)
);
- assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない ")),
+ assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", "これは本ではない "),
new String[] { "これ", "は", "本", "で", "は", "ない" },
new int[] { 0, 2, 3, 4, 5, 6, 8 },
new int[] { 2, 3, 4, 5, 6, 8, 9 },
@@ -279,7 +278,7 @@ public class TestJapaneseTokenizer exten
public void testUserDict() throws Exception {
// Not a great test because w/o userdict.txt the
// segmentation is the same:
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "関西国際空港に行った"),
new String[] { "関西", "国際", "空港", "に", "行っ", "た" },
new int[] { 0, 2, 4, 6, 7, 9 },
new int[] { 2, 4, 6, 7, 9, 10 },
@@ -289,7 +288,7 @@ public class TestJapaneseTokenizer exten
public void testUserDict2() throws Exception {
// Better test: w/o userdict the segmentation is different:
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "朝青龍"),
new String[] { "朝青龍" },
new int[] { 0 },
new int[] { 3 },
@@ -299,7 +298,7 @@ public class TestJapaneseTokenizer exten
public void testUserDict3() throws Exception {
// Test entry that breaks into multiple tokens:
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "abcd"),
new String[] { "a", "b", "cd" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 4 },
@@ -315,7 +314,7 @@ public class TestJapaneseTokenizer exten
/*
public void testUserDict4() throws Exception {
// Test entry that has another entry as prefix
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "abcdefghij"),
new String[] { "ab", "cd", "efg", "hij" },
new int[] { 0, 2, 4, 7 },
new int[] { 2, 4, 7, 10 },
@@ -366,7 +365,7 @@ public class TestJapaneseTokenizer exten
}
private void assertReadings(String input, String... readings) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
ts.reset();
for(String reading : readings) {
@@ -378,7 +377,7 @@ public class TestJapaneseTokenizer exten
}
private void assertPronunciations(String input, String... pronunciations) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
ts.reset();
for(String pronunciation : pronunciations) {
@@ -390,7 +389,7 @@ public class TestJapaneseTokenizer exten
}
private void assertBaseForms(String input, String... baseForms) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
ts.reset();
for(String baseForm : baseForms) {
@@ -402,7 +401,7 @@ public class TestJapaneseTokenizer exten
}
private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
ts.reset();
for(String inflectionType : inflectionTypes) {
@@ -414,7 +413,7 @@ public class TestJapaneseTokenizer exten
}
private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
ts.reset();
for(String inflectionForm : inflectionForms) {
@@ -426,7 +425,7 @@ public class TestJapaneseTokenizer exten
}
private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
ts.reset();
for(String partOfSpeech : partsOfSpeech) {
@@ -619,7 +618,7 @@ public class TestJapaneseTokenizer exten
if (numIterations > 1) {
// warmup
for (int i = 0; i < numIterations; i++) {
- final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+ final TokenStream ts = analyzer.tokenStream("ignored", line);
ts.reset();
while(ts.incrementToken());
}
@@ -628,7 +627,7 @@ public class TestJapaneseTokenizer exten
long totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
- final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+ final TokenStream ts = analyzer.tokenStream("ignored", line);
ts.reset();
while(ts.incrementToken());
}
@@ -640,7 +639,7 @@ public class TestJapaneseTokenizer exten
totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
for (String sentence: sentences) {
- final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+ final TokenStream ts = analyzer.tokenStream("ignored", sentence);
ts.reset();
while(ts.incrementToken());
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java Sun Aug 11 12:19:13 2013
@@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder
System.out.println(" encode...");
- PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
+ PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
IntsRef scratch = new IntsRef();
long ord = -1; // first ord will be 0
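For context: PositiveIntOutputs.getSingleton() no longer takes the boolean sharing flag, as the hunk above shows. A small standalone sketch of building an FST with the no-argument singleton; the input terms and output values here are made up for illustration:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRef;
    import org.apache.lucene.util.fst.Builder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    public class FstSingletonDemo {
      public static void main(String[] args) throws Exception {
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRef scratch = new IntsRef();
        // Input terms must be added in sorted order.
        builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
        builder.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
        FST<Long> fst = builder.finish();
        System.out.println(Util.get(fst, new BytesRef("dog"))); // prints 7
      }
    }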
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml Sun Aug 11 12:19:13 2013
@@ -19,9 +19,9 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
<dependencies>
- <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.5" transitive="false"/>
- <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.5" transitive="false"/>
- <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.5" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -26,38 +26,21 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
-
/**
* {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikAnalyzer extends Analyzer {
-
- private final DICTIONARY dictionary;
private final Version version;
/**
- * Builds an analyzer for a given PolishStemmer.DICTIONARY enum.
- *
- * @param vers
- * lucene compatibility version
- * @param dict
- * A constant specifying which dictionary to choose. See the
- * Morfologik documentation for details or use the default.
- */
- public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
- this.version = vers;
- this.dictionary = dict;
- }
-
- /**
- * Builds an analyzer for an original MORFOLOGIK dictionary.
+ * Builds an analyzer with the default Morfologik dictionary (polimorf).
*
- * @param vers lucene compatibility version
+ * @param version
+ * Lucene compatibility version
*/
- public MorfologikAnalyzer(final Version vers) {
- this(vers, DICTIONARY.MORFOLOGIK);
+ public MorfologikAnalyzer(final Version version) {
+ this.version = version;
}
/**
@@ -78,7 +61,7 @@ public class MorfologikAnalyzer extends
final Tokenizer src = new StandardTokenizer(this.version, reader);
return new TokenStreamComponents(
- src,
- new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
+ src,
+ new MorfologikFilter(new StandardFilter(this.version, src), this.version));
}
}
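For context: with the dictionary enum removed, MorfologikAnalyzer is constructed from a Lucene compatibility version alone. A usage sketch under that assumption; the field name and input text are illustrative:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class MorfologikAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new MorfologikAnalyzer(Version.LUCENE_44);  // single bundled dictionary
        TokenStream ts = a.tokenStream("body", "liście danych");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term);  // one token per lemma interpretation
        }
        ts.end();
        ts.close();
      }
    }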
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Sun Aug 11 12:19:13 2013
@@ -20,22 +20,24 @@ package org.apache.lucene.analysis.morfo
import java.io.IOException;
import java.util.*;
+import java.util.regex.Pattern;
import morfologik.stemming.*;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;
/**
- * {@link TokenFilter} using Morfologik library.
+ * {@link TokenFilter} using Morfologik library to transform input tokens into lemma and
+ * morphosyntactic (POS) tokens. Applies to Polish only.
*
- * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
- * annotations for produced lemmas. See the Morfologik documentation for details.
+ * <p>MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
+ * annotations for produced lemmas. See the Morfologik documentation for details.</p>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
@@ -44,6 +46,7 @@ public class MorfologikFilter extends To
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final CharsRef scratch = new CharsRef(0);
private final CharacterUtils charUtils;
@@ -58,13 +61,11 @@ public class MorfologikFilter extends To
private int lemmaListIndex;
/**
- * Builds a filter for given PolishStemmer.DICTIONARY enum.
- *
+ * Creates a MorfologikFilter.
* @param in input token stream
- * @param dict PolishStemmer.DICTIONARY enum
* @param version Lucene version compatibility for lowercasing.
*/
- public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
+ public MorfologikFilter(final TokenStream in, final Version version) {
super(in);
this.input = in;
@@ -73,7 +74,7 @@ public class MorfologikFilter extends To
ClassLoader cl = me.getContextClassLoader();
try {
me.setContextClassLoader(PolishStemmer.class.getClassLoader());
- this.stemmer = new PolishStemmer(dict);
+ this.stemmer = new PolishStemmer();
this.charUtils = CharacterUtils.getInstance(version);
this.lemmaList = Collections.emptyList();
} finally {
@@ -81,44 +82,30 @@ public class MorfologikFilter extends To
}
}
+ /**
+ * A pattern used to split lemma forms.
+ */
+ private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
+
private void popNextLemma() {
- // Collect all tags for the next unique lemma.
- CharSequence currentStem;
- int tags = 0;
- do {
- final WordData lemma = lemmaList.get(lemmaListIndex++);
- currentStem = lemma.getStem();
- final CharSequence tag = lemma.getTag();
- if (tag != null) {
- if (tagsList.size() <= tags) {
+ // One tag (concatenated) per lemma.
+ final WordData lemma = lemmaList.get(lemmaListIndex++);
+ termAtt.setEmpty().append(lemma.getStem());
+ CharSequence tag = lemma.getTag();
+ if (tag != null) {
+ String[] tags = lemmaSplitter.split(tag.toString());
+ for (int i = 0; i < tags.length; i++) {
+ if (tagsList.size() <= i) {
tagsList.add(new StringBuilder());
}
-
- final StringBuilder buffer = tagsList.get(tags++);
+ StringBuilder buffer = tagsList.get(i);
buffer.setLength(0);
- buffer.append(lemma.getTag());
- }
- } while (lemmaListIndex < lemmaList.size() &&
- equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
-
- // Set the lemma's base form and tags as attributes.
- termAtt.setEmpty().append(currentStem);
- tagsAtt.setTags(tagsList.subList(0, tags));
- }
-
- /**
- * Compare two char sequences for equality. Assumes non-null arguments.
- */
- private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
- int len1 = s1.length();
- int len2 = s2.length();
- if (len1 != len2) return false;
- for (int i = len1; --i >= 0;) {
- if (s1.charAt(i) != s2.charAt(i)) {
- return false;
+ buffer.append(tags[i]);
}
+ tagsAtt.setTags(tagsList.subList(0, tags.length));
+ } else {
+ tagsAtt.setTags(Collections.<StringBuilder> emptyList());
}
- return true;
}
/**
@@ -140,7 +127,8 @@ public class MorfologikFilter extends To
popNextLemma();
return true;
} else if (this.input.incrementToken()) {
- if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+ if (!keywordAttr.isKeyword() &&
+ (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
current = captureState();
popNextLemma();
} else {
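For context: the rewritten popNextLemma() assumes Morfologik 1.7 returns one WordData per lemma, with alternative tag sets concatenated by '+' or '|', which the filter splits back apart. A standalone illustration of just that splitting step; the tag string is hypothetical:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Pattern;

    public class LemmaTagSplitDemo {
      // Same pattern as the lemmaSplitter field added above.
      private static final Pattern LEMMA_SPLITTER = Pattern.compile("\\+|\\|");

      public static void main(String[] args) {
        String concatenated = "subst:pl:nom:m3+subst:pl:acc:m3";
        List<StringBuilder> tagsList = new ArrayList<StringBuilder>();
        for (String tag : LEMMA_SPLITTER.split(concatenated)) {
          tagsList.add(new StringBuilder(tag));
        }
        System.out.println(tagsList);  // [subst:pl:nom:m3, subst:pl:acc:m3]
      }
    }

The KeywordAttribute check added to incrementToken() means tokens marked by an upstream SetKeywordMarkerFilter skip dictionary lookup entirely, as the new testKeywordAttrTokens test further down exercises.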
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -17,12 +17,8 @@ package org.apache.lucene.analysis.morfo
* limitations under the License.
*/
-import java.util.Arrays;
-import java.util.Locale;
import java.util.Map;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -32,39 +28,28 @@ import org.apache.lucene.analysis.util.T
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+ * <filter class="solr.MorfologikFilterFactory" />
* </analyzer>
* </fieldType></pre>
*
- * <p>Any of Morfologik dictionaries can be used, these are at the moment:
- * <code>MORFOLOGIK</code> (Morfologik's original dictionary),
- * <code>MORFEUSZ</code> (Morfeusz-SIAT),
- * <code>COMBINED</code> (both of the dictionaries above, combined).
- *
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory {
- /** Dictionary. */
- private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK;
-
/** Schema attribute. */
+ @Deprecated
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
-
+
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
+
+ // Be specific about no-longer-supported dictionary attribute.
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) {
- try {
- DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT));
- assert dictionary != null;
- this.dictionary = dictionary;
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the "
- + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: "
- + dictionaryName);
- }
+ throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
+ + "longer supported (Morfologik has one dictionary): " + dictionaryName);
}
+
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -72,6 +57,6 @@ public class MorfologikFilterFactory ext
@Override
public TokenStream create(TokenStream ts) {
- return new MorfologikFilter(ts, dictionary, luceneMatchVersion);
+ return new MorfologikFilter(ts, luceneMatchVersion);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java Sun Aug 11 12:19:13 2013
@@ -23,9 +23,9 @@ import java.util.List;
import org.apache.lucene.util.Attribute;
/**
- * Morfologik dictionaries provide morphosyntactic annotations for
+ * Morfologik provides morphosyntactic annotations for
* surface forms. For the exact format and description of these,
- * see the project's documentation (annotations vary by dictionary!).
+ * see the project's documentation.
*/
public interface MorphosyntacticTagsAttribute extends Attribute {
/**
@@ -36,7 +36,9 @@ public interface MorphosyntacticTagsAttr
public void setTags(List<StringBuilder> tags);
/**
- * Returns the POS tag of the term.
+ * Returns the POS tag of the term. A single word may have multiple POS tags,
+ * depending on the interpretation (context disambiguation is typically needed
+ * to determine which particular tag is appropriate).
*/
public List<StringBuilder> getTags();
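For context: a sketch of reading these tags off a token stream, mirroring the dumpTokens() helper added to the test below; the field name and input are illustrative:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
    import org.apache.lucene.analysis.morfologik.MorphosyntacticTagsAttribute;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class TagsAttributeDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new MorfologikAnalyzer(Version.LUCENE_44);
        TokenStream ts = a.tokenStream("dummy", "danych");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        MorphosyntacticTagsAttribute tags = ts.addAttribute(MorphosyntacticTagsAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // Several tags per lemma are possible; disambiguation is the caller's job.
          System.out.println(term + " => " + tags.getTags());
        }
        ts.end();
        ts.close();
      }
    }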
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -18,11 +18,19 @@ package org.apache.lucene.analysis.morfo
*/
import java.io.IOException;
-import java.io.StringReader;
+import java.io.Reader;
import java.util.TreeSet;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -56,16 +64,28 @@ public class TestMorfologikAnalyzer exte
assertAnalyzesToReuse(
a,
"T. Gl\u00FCcksberg",
- new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
- new int[] { 0, 0, 0, 3 },
- new int[] { 1, 1, 1, 13 },
- new int[] { 1, 0, 0, 1 });
+ new String[] { "tom", "tona", "Gl\u00FCcksberg" },
+ new int[] { 0, 0, 3 },
+ new int[] { 1, 1, 13 },
+ new int[] { 1, 0, 1 });
+ }
+
+ @SuppressWarnings("unused")
+ private void dumpTokens(String input) throws IOException {
+ TokenStream ts = getTestAnalyzer().tokenStream("dummy", input);
+ ts.reset();
+
+ MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
+ CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
+ while (ts.incrementToken()) {
+ System.out.println(charTerm.toString() + " => " + attribute.getTags());
+ }
}
/** Test reuse of MorfologikFilter with leftover stems. */
public final void testLeftoverStems() throws IOException {
Analyzer a = getTestAnalyzer();
- TokenStream ts_1 = a.tokenStream("dummy", new StringReader("liście"));
+ TokenStream ts_1 = a.tokenStream("dummy", "liście");
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
ts_1.reset();
ts_1.incrementToken();
@@ -73,7 +93,7 @@ public class TestMorfologikAnalyzer exte
ts_1.end();
ts_1.close();
- TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
+ TokenStream ts_2 = a.tokenStream("dummy", "danych");
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
ts_2.reset();
ts_2.incrementToken();
@@ -120,7 +140,7 @@ public class TestMorfologikAnalyzer exte
/** Test morphosyntactic annotations. */
public final void testPOSAttribute() throws IOException {
- TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
+ TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście");
ts.reset();
assertPOSToken(ts, "liście",
@@ -144,6 +164,34 @@ public class TestMorfologikAnalyzer exte
ts.close();
}
+ /** */
+ public final void testKeywordAttrTokens() throws IOException {
+ final Version version = TEST_VERSION_CURRENT;
+
+ Analyzer a = new MorfologikAnalyzer(version) {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final CharArraySet keywords = new CharArraySet(version, 1, false);
+ keywords.add("liście");
+
+ final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+ result = new SetKeywordMarkerFilter(result, keywords);
+ result = new MorfologikFilter(result, TEST_VERSION_CURRENT);
+
+ return new TokenStreamComponents(src, result);
+ }
+ };
+
+ assertAnalyzesToReuse(
+ a,
+ "liÅcie danych",
+ new String[] { "liÅcie", "dany", "dana", "dane", "daÄ" },
+ new int[] { 0, 7, 7, 7, 7 },
+ new int[] { 6, 13, 13, 13, 13 },
+ new int[] { 1, 1, 0, 0, 0 });
+ }
+
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.morfo
*/
import java.io.StringReader;
+import java.util.Collections;
import java.util.HashMap;
-import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
@@ -31,10 +31,7 @@ import org.apache.lucene.analysis.TokenS
public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
public void testCreateDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
- Map<String,String> initParams = new HashMap<String,String>();
- initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE,
- "morfologik");
- MorfologikFilterFactory factory = new MorfologikFilterFactory(initParams);
+ MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Sun Aug 11 12:19:13 2013
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.cn.sm
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.util.Properties;
+import org.apache.lucene.util.IOUtils;
+
/**
* Manages analysis data configuration for SmartChineseAnalyzer
* <p>
@@ -77,13 +80,13 @@ public class AnalyzerProfile {
Properties prop = new Properties();
try {
FileInputStream input = new FileInputStream(propFile);
- prop.load(input);
+ prop.load(new InputStreamReader(input, IOUtils.CHARSET_UTF_8));
String dir = prop.getProperty("analysis.data.dir", "");
input.close();
return dir;
} catch (IOException e) {
+ return "";
}
- return "";
}
}
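For context: Properties.load(InputStream) always decodes as ISO-8859-1, which mangles UTF-8 paths; the hunk above wraps the stream in a UTF-8 InputStreamReader instead, and moves the "" fallback into the catch block. A standalone sketch of the same idiom; the file name is illustrative:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.nio.charset.Charset;
    import java.util.Properties;

    public class Utf8PropertiesDemo {
      public static void main(String[] args) throws IOException {
        Properties prop = new Properties();
        FileInputStream input = new FileInputStream("analysis.properties");
        try {
          // Reader-based load honors the charset we choose instead of ISO-8859-1.
          prop.load(new InputStreamReader(input, Charset.forName("UTF-8")));
        } finally {
          input.close();
        }
        System.out.println(prop.getProperty("analysis.data.dir", ""));
      }
    }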
Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cn.sm
import java.io.IOException;
import java.io.Reader;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -185,7 +184,7 @@ public class TestSmartChineseAnalyzer ex
sb.append("æè´ä¹°äºéå
·åæè£
ã");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
- TokenStream stream = analyzer.tokenStream("", new StringReader(sb.toString()));
+ TokenStream stream = analyzer.tokenStream("", sb.toString());
stream.reset();
while (stream.incrementToken()) {
}
@@ -198,7 +197,7 @@ public class TestSmartChineseAnalyzer ex
sb.append("æè´ä¹°äºéå
·åæè£
");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
- TokenStream stream = analyzer.tokenStream("", new StringReader(sb.toString()));
+ TokenStream stream = analyzer.tokenStream("", sb.toString());
stream.reset();
while (stream.incrementToken()) {
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java Sun Aug 11 12:19:13 2013
@@ -35,7 +35,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
@@ -62,7 +61,7 @@ public class UIMABaseAnalyzerTest extend
@Test
public void baseUIMAAnalyzerStreamTest() throws Exception {
- TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
+ TokenStream ts = analyzer.tokenStream("text", "the big brown fox jumped on the wood");
assertTokenStreamContents(ts, new String[]{"the", "big", "brown", "fox", "jumped", "on", "the", "wood"});
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java Sun Aug 11 12:19:13 2013
@@ -23,8 +23,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.StringReader;
-
/**
* Testcase for {@link UIMATypeAwareAnalyzer}
*/
@@ -51,7 +49,7 @@ public class UIMATypeAwareAnalyzerTest e
public void baseUIMATypeAwareAnalyzerStreamTest() throws Exception {
// create a token stream
- TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
+ TokenStream ts = analyzer.tokenStream("text", "the big brown fox jumped on the wood");
// check that 'the big brown fox jumped on the wood' tokens have the expected PoS types
assertTokenStreamContents(ts,
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Sun Aug 11 12:19:13 2013
@@ -119,14 +119,9 @@ public class CreateIndexTask extends Per
if (mergeScheduler.equals("org.apache.lucene.index.ConcurrentMergeScheduler")) {
ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwConf.getMergeScheduler();
- int v = config.get("concurrent.merge.scheduler.max.thread.count", -1);
- if (v != -1) {
- cms.setMaxThreadCount(v);
- }
- v = config.get("concurrent.merge.scheduler.max.merge.count", -1);
- if (v != -1) {
- cms.setMaxMergeCount(v);
- }
+ int maxThreadCount = config.get("concurrent.merge.scheduler.max.thread.count", ConcurrentMergeScheduler.DEFAULT_MAX_THREAD_COUNT);
+ int maxMergeCount = config.get("concurrent.merge.scheduler.max.merge.count", ConcurrentMergeScheduler.DEFAULT_MAX_MERGE_COUNT);
+ cms.setMaxMergesAndThreads(maxMergeCount, maxThreadCount);
}
}
@@ -151,13 +146,10 @@ public class CreateIndexTask extends Per
} catch (Exception e) {
throw new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy", e);
}
+ iwConf.getMergePolicy().setNoCFSRatio(isCompound ? 1.0 : 0.0);
if(iwConf.getMergePolicy() instanceof LogMergePolicy) {
LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
- logMergePolicy.setUseCompoundFile(isCompound);
logMergePolicy.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
- } else if(iwConf.getMergePolicy() instanceof TieredMergePolicy) {
- TieredMergePolicy tieredMergePolicy = (TieredMergePolicy) iwConf.getMergePolicy();
- tieredMergePolicy.setUseCompoundFile(isCompound);
}
}
final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
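For context: compound-file selection moved from the per-policy setUseCompoundFile() flags to setNoCFSRatio() on the base MergePolicy, and the two ConcurrentMergeScheduler limits are now set together in one call. A configuration sketch under those assumptions; the analyzer and values are illustrative:

    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.index.ConcurrentMergeScheduler;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.util.Version;

    public class MergeConfigDemo {
      public static void main(String[] args) {
        IndexWriterConfig iwConf = new IndexWriterConfig(
            Version.LUCENE_44, new WhitespaceAnalyzer(Version.LUCENE_44));
        // 1.0 = always write compound files, 0.0 = never (replaces setUseCompoundFile).
        iwConf.getMergePolicy().setNoCFSRatio(1.0);
        ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
        cms.setMaxMergesAndThreads(4, 2);  // maxMergeCount, maxThreadCount
        iwConf.setMergeScheduler(cms);
      }
    }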
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java Sun Aug 11 12:19:13 2013
@@ -18,9 +18,9 @@ package org.apache.lucene.benchmark.byTa
*/
import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -80,8 +80,7 @@ public class Config {
}
// read props from string
this.props = new Properties();
- // props.load always assumes iso8859-1...
- props.load(new ByteArrayInputStream(sb.toString().getBytes("ISO-8859-1")));
+ props.load(new StringReader(sb.toString()));
// make sure work dir is set properly
if (props.get("work.dir") == null) {
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Sun Aug 11 12:19:13 2013
@@ -21,7 +21,6 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
-import java.io.StringReader;
import java.text.Collator;
import java.util.List;
import java.util.Locale;
@@ -49,6 +48,7 @@ import org.apache.lucene.index.IndexWrit
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
@@ -754,7 +754,7 @@ public class TestPerfTasksLogic extends
assertEquals(2, writer.getConfig().getMaxBufferedDocs());
assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
- assertFalse(((LogMergePolicy) writer.getConfig().getMergePolicy()).getUseCompoundFile());
+ assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
writer.close();
Directory dir = benchmark.getRunData().getDirectory();
IndexReader reader = DirectoryReader.open(dir);
@@ -978,8 +978,8 @@ public class TestPerfTasksLogic extends
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
throws Exception {
- TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
- TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
+ TokenStream ts1 = a1.tokenStream("bogus", text);
+ TokenStream ts2 = a2.tokenStream("bogus", text);
ts1.reset();
ts2.reset();
TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
@@ -1029,7 +1029,7 @@ public class TestPerfTasksLogic extends
Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.getRunData().getAnalyzer().tokenStream
- ("bogus", new StringReader(text)).close();
+ ("bogus", text).close();
BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one", "one two", "two", "two three",
"three", "three four", "four", "four five",
Modified: lucene/dev/branches/lucene4956/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/build.xml?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/build.xml Sun Aug 11 12:19:13 2013
@@ -183,7 +183,9 @@
<forbidden-apis internalRuntimeForbidden="true" classpathref="forbidden-apis.classpath">
<bundledSignatures name="jdk-unsafe-${javac.target}"/>
<bundledSignatures name="jdk-deprecated-${javac.target}"/>
- <signaturesFileSet file="${common.dir}/tools/forbiddenApis/executors.txt"/>
+ <signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
+ <include name="base.txt" />
+ </signaturesFileSet>
<fileset dir="${basedir}/build" includes="**/*.class" />
</forbidden-apis>
</target>
@@ -345,7 +347,7 @@
</target>
<!-- rat-sources-typedef is *not* a useless dependency. do not remove -->
- <target name="rat-sources" depends="rat-sources-typedef">
+ <target name="rat-sources" depends="rat-sources-typedef,common.rat-sources">
<subant target="rat-sources" failonerror="true" inheritall="false">
<propertyset refid="uptodate.and.compiled.properties"/>
<fileset dir="core" includes="build.xml"/>
@@ -608,4 +610,13 @@
<jar-checksum-macro srcdir="${common.dir}" dstdir="${common.dir}/licenses"/>
</target>
+ <target name="regenerate">
+ <subant target="regenerate" failonerror="true" inheritall="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ <fileset dir="core" includes="build.xml"/>
+ <fileset dir="test-framework" includes="build.xml"/>
+ </subant>
+ <modules-crawl target="regenerate"/>
+ </target>
+
</project>
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Sun Aug 11 12:19:13 2013
@@ -33,7 +33,6 @@ import org.apache.lucene.search.Wildcard
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
-import java.io.StringReader;
import java.util.Collection;
import java.util.LinkedList;
@@ -86,7 +85,7 @@ public class SimpleNaiveBayesClassifier
private String[] tokenizeDoc(String doc) throws IOException {
Collection<String> result = new LinkedList<String>();
- TokenStream tokenStream = analyzer.tokenStream(textFieldName, new StringReader(doc));
+ TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java Sun Aug 11 12:19:13 2013
@@ -69,9 +69,6 @@ public class BlockTermsReader extends Fi
private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
- // Caches the most recently looked-up field + terms:
- private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
-
// Reads the terms index
private TermsIndexReaderBase indexReader;
@@ -113,11 +110,10 @@ public class BlockTermsReader extends Fi
// private String segment;
public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, SegmentInfo info, PostingsReaderBase postingsReader, IOContext context,
- int termsCacheSize, String segmentSuffix)
+ String segmentSuffix)
throws IOException {
this.postingsReader = postingsReader;
- termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
// this.segment = segment;
in = dir.openInput(IndexFileNames.segmentFileName(info.name, segmentSuffix, BlockTermsWriter.TERMS_EXTENSION),
@@ -317,11 +313,6 @@ public class BlockTermsReader extends Fi
calls next() (which is not "typical"), then we'll do the real seek */
private boolean seekPending;
- /* How many blocks we've read since last seek. Once this
- is >= indexEnum.getDivisor() we set indexIsCurrent to false (since
- the index can no long bracket seek-within-block). */
- private int blocksSinceSeek;
-
private byte[] termSuffixes;
private ByteArrayDataInput termSuffixesReader = new ByteArrayDataInput();
@@ -362,13 +353,13 @@ public class BlockTermsReader extends Fi
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
@Override
- public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
+ public SeekStatus seekCeil(final BytesRef target) throws IOException {
if (indexEnum == null) {
throw new IllegalStateException("terms index was not loaded");
}
- //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
+ //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
if (didIndexNext) {
if (nextIndexTerm == null) {
//System.out.println(" nextIndexTerm=null");
@@ -377,23 +368,6 @@ public class BlockTermsReader extends Fi
}
}
- // Check cache
- if (useCache) {
- fieldTerm.term = target;
- // TODO: should we differentiate "frozen"
- // TermState (ie one that was cloned and
- // cached/returned by termState()) from the
- // malleable (primary) one?
- final TermState cachedState = termsCache.get(fieldTerm);
- if (cachedState != null) {
- seekPending = true;
- //System.out.println(" cached!");
- seekExact(target, cachedState);
- //System.out.println(" term=" + term.utf8ToString());
- return SeekStatus.FOUND;
- }
- }
-
boolean doSeek = true;
// See if we can avoid seeking, because target term
@@ -441,8 +415,7 @@ public class BlockTermsReader extends Fi
assert result;
indexIsCurrent = true;
- didIndexNext = false;
- blocksSinceSeek = 0;
+ didIndexNext = false;
if (doOrd) {
state.ord = indexEnum.ord()-1;
@@ -574,14 +547,6 @@ public class BlockTermsReader extends Fi
// Done! Exact match. Stop here, fill in
// real term, return FOUND.
//System.out.println(" FOUND");
-
- if (useCache) {
- // Store in cache
- decodeMetaData();
- //System.out.println(" cache! state=" + state);
- termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
- }
-
return SeekStatus.FOUND;
} else {
//System.out.println(" NOT_FOUND");
@@ -758,7 +723,6 @@ public class BlockTermsReader extends Fi
indexIsCurrent = true;
didIndexNext = false;
- blocksSinceSeek = 0;
seekPending = false;
state.ord = indexEnum.ord()-1;
@@ -831,8 +795,7 @@ public class BlockTermsReader extends Fi
postingsReader.readTermsBlock(in, fieldInfo, state);
- blocksSinceSeek++;
- indexIsCurrent = indexIsCurrent && (blocksSinceSeek < indexReader.getDivisor());
+ indexIsCurrent = false;
//System.out.println(" indexIsCurrent=" + indexIsCurrent);
return true;
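For context: with the terms cache removed, seekCeil loses its useCache flag and callers pass only the target term. A caller-side sketch; the enum variable and term text are illustrative:

    import java.io.IOException;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    public class SeekCeilDemo {
      static boolean termExists(TermsEnum termsEnum, String text) throws IOException {
        // Formerly seekCeil(target, useCache); the cache parameter is gone.
        return termsEnum.seekCeil(new BytesRef(text)) == TermsEnum.SeekStatus.FOUND;
      }
    }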
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java Sun Aug 11 12:19:13 2013
@@ -27,7 +27,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
-import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
import java.util.HashMap;
import java.util.Comparator;
@@ -43,21 +43,15 @@ import org.apache.lucene.index.IndexFile
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
- // NOTE: long is overkill here, since this number is 128
- // by default and only indexDivisor * 128 if you change
- // the indexDivisor at search time. But, we use this in a
+ // NOTE: long is overkill here, but we use this in a
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So to avoid
// having to upgrade each multiple to long in multiple
// places (error prone), we use long here:
- private long totalIndexInterval;
-
- private int indexDivisor;
- final private int indexInterval;
-
- // Closed if indexLoaded is true:
- private IndexInput in;
- private volatile boolean indexLoaded;
+ private final long indexInterval;
+
+ private final int packedIntsVersion;
+ private final int blocksize;
private final Comparator<BytesRef> termComp;
@@ -72,35 +66,24 @@ public class FixedGapTermsIndexReader ex
// start of the field info data
private long dirOffset;
- private final int version;
-
- public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
+ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
throws IOException {
this.termComp = termComp;
-
- assert indexDivisor == -1 || indexDivisor > 0;
-
- in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);
+
+ final IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);
boolean success = false;
try {
- version = readHeader(in);
- indexInterval = in.readInt();
+ readHeader(in);
+ indexInterval = in.readVInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
}
- this.indexDivisor = indexDivisor;
-
- if (indexDivisor < 0) {
- totalIndexInterval = indexInterval;
- } else {
- // In case terms index gets loaded, later, on demand
- totalIndexInterval = indexInterval * indexDivisor;
- }
- assert totalIndexInterval > 0;
+ packedIntsVersion = in.readVInt();
+ blocksize = in.readVInt();
seekDir(in, dirOffset);
@@ -112,7 +95,7 @@ public class FixedGapTermsIndexReader ex
//System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
- final int numIndexTerms = in.readVInt();
+ final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
if (numIndexTerms < 0) {
throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
}
@@ -124,47 +107,33 @@ public class FixedGapTermsIndexReader ex
throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + " numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
}
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
+ FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(in, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
}
}
success = true;
} finally {
- if (!success) {
+ if (success) {
+ IOUtils.close(in);
+ } else {
IOUtils.closeWhileHandlingException(in);
}
- if (indexDivisor > 0) {
- in.close();
- in = null;
- if (success) {
- indexLoaded = true;
- }
- termBytesReader = termBytes.freeze(true);
- }
+ termBytesReader = termBytes.freeze(true);
}
}
-
- @Override
- public int getDivisor() {
- return indexDivisor;
- }
- private int readHeader(IndexInput input) throws IOException {
- int version = CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
- FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_CURRENT);
- if (version < FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
- dirOffset = input.readLong();
- }
- return version;
+ private void readHeader(IndexInput input) throws IOException {
+ CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
+ FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT);
}
private class IndexEnum extends FieldIndexEnum {
- private final FieldIndexData.CoreFieldIndex fieldIndex;
+ private final FieldIndexData fieldIndex;
private final BytesRef term = new BytesRef();
private long ord;
- public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
+ public IndexEnum(FieldIndexData fieldIndex) {
this.fieldIndex = fieldIndex;
}
@@ -175,12 +144,11 @@ public class FixedGapTermsIndexReader ex
@Override
public long seek(BytesRef target) {
- int lo = 0; // binary search
- int hi = fieldIndex.numIndexTerms - 1;
- assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
+ long lo = 0; // binary search
+ long hi = fieldIndex.numIndexTerms - 1;
while (hi >= lo) {
- int mid = (lo + hi) >>> 1;
+ long mid = (lo + hi) >>> 1;
final long offset = fieldIndex.termOffsets.get(mid);
final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
@@ -193,7 +161,7 @@ public class FixedGapTermsIndexReader ex
lo = mid + 1;
} else {
assert mid >= 0;
- ord = mid*totalIndexInterval;
+ ord = mid*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
}
}
@@ -207,17 +175,17 @@ public class FixedGapTermsIndexReader ex
final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
- ord = hi*totalIndexInterval;
+ ord = hi*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
}
@Override
public long next() {
- final int idx = 1 + (int) (ord / totalIndexInterval);
+ final long idx = 1 + (ord / indexInterval);
if (idx >= fieldIndex.numIndexTerms) {
return -1;
}
- ord += totalIndexInterval;
+ ord += indexInterval;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
@@ -232,13 +200,13 @@ public class FixedGapTermsIndexReader ex
@Override
public long seek(long ord) {
- int idx = (int) (ord / totalIndexInterval);
+ long idx = ord / indexInterval;
// caller must ensure ord is in bounds
assert idx < fieldIndex.numIndexTerms;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
- this.ord = idx * totalIndexInterval;
+ this.ord = idx * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
}
@@ -249,176 +217,58 @@ public class FixedGapTermsIndexReader ex
}
private final class FieldIndexData {
-
- volatile CoreFieldIndex coreIndex;
-
- private final long indexStart;
- private final long termsStart;
- private final long packedIndexStart;
- private final long packedOffsetsStart;
-
- private final int numIndexTerms;
-
- public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
- long packedOffsetsStart) throws IOException {
-
+ // where this field's terms begin in the packed byte[]
+ // data
+ final long termBytesStart;
+
+ // offset into index termBytes
+ final MonotonicBlockPackedReader termOffsets;
+
+ // index pointers into main terms dict
+ final MonotonicBlockPackedReader termsDictOffsets;
+
+ final long numIndexTerms;
+ final long termsStart;
+
+ public FieldIndexData(IndexInput in, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {
+
this.termsStart = termsStart;
- this.indexStart = indexStart;
- this.packedIndexStart = packedIndexStart;
- this.packedOffsetsStart = packedOffsetsStart;
+ termBytesStart = termBytes.getPointer();
+
+ IndexInput clone = in.clone();
+ clone.seek(indexStart);
+
this.numIndexTerms = numIndexTerms;
-
- if (indexDivisor > 0) {
- loadTermsIndex();
- }
- }
-
- private void loadTermsIndex() throws IOException {
- if (coreIndex == null) {
- coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
- }
- }
-
- private final class CoreFieldIndex {
-
- // where this field's terms begin in the packed byte[]
- // data
- final long termBytesStart;
-
- // offset into index termBytes
- final PackedInts.Reader termOffsets;
-
- // index pointers into main terms dict
- final PackedInts.Reader termsDictOffsets;
-
- final int numIndexTerms;
- final long termsStart;
-
- public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
-
- this.termsStart = termsStart;
- termBytesStart = termBytes.getPointer();
-
- IndexInput clone = in.clone();
- clone.seek(indexStart);
-
- // -1 is passed to mean "don't load term index", but
- // if we are then later loaded it's overwritten with
- // a real value
- assert indexDivisor > 0;
-
- this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor;
-
- assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;
-
- if (indexDivisor == 1) {
- // Default (load all index terms) is fast -- slurp in the images from disk:
-
- try {
- final long numTermBytes = packedIndexStart - indexStart;
- termBytes.copy(clone, numTermBytes);
-
- // records offsets into main terms dict file
- termsDictOffsets = PackedInts.getReader(clone);
- assert termsDictOffsets.size() == numIndexTerms;
-
- // records offsets into byte[] term data
- termOffsets = PackedInts.getReader(clone);
- assert termOffsets.size() == 1+numIndexTerms;
- } finally {
- clone.close();
- }
- } else {
- // Get packed iterators
- final IndexInput clone1 = in.clone();
- final IndexInput clone2 = in.clone();
-
- try {
- // Subsample the index terms
- clone1.seek(packedIndexStart);
- final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1, PackedInts.DEFAULT_BUFFER_SIZE);
-
- clone2.seek(packedOffsetsStart);
- final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2, PackedInts.DEFAULT_BUFFER_SIZE);
-
- // TODO: often we can get by w/ fewer bits per
- // value, below.. .but this'd be more complex:
- // we'd have to try @ fewer bits and then grow
- // if we overflowed it.
-
- PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue(), PackedInts.DEFAULT);
- PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue(), PackedInts.DEFAULT);
-
- termsDictOffsets = termsDictOffsetsM;
- termOffsets = termOffsetsM;
-
- int upto = 0;
-
- long termOffsetUpto = 0;
-
- while(upto < this.numIndexTerms) {
- // main file offset copies straight over
- termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
-
- termOffsetsM.set(upto, termOffsetUpto);
-
- long termOffset = termOffsetsIter.next();
- long nextTermOffset = termOffsetsIter.next();
- final int numTermBytes = (int) (nextTermOffset - termOffset);
-
- clone.seek(indexStart + termOffset);
- assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length();
- assert indexStart + termOffset + numTermBytes < clone.length();
-
- termBytes.copy(clone, numTermBytes);
- termOffsetUpto += numTermBytes;
-
- upto++;
- if (upto == this.numIndexTerms) {
- break;
- }
-
- // skip terms:
- termsDictOffsetsIter.next();
- for(int i=0;i<indexDivisor-2;i++) {
- termOffsetsIter.next();
- termsDictOffsetsIter.next();
- }
- }
- termOffsetsM.set(upto, termOffsetUpto);
-
- } finally {
- clone1.close();
- clone2.close();
- clone.close();
- }
- }
+ assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;
+
+ // slurp in the images from disk:
+
+ try {
+ final long numTermBytes = packedIndexStart - indexStart;
+ termBytes.copy(clone, numTermBytes);
+
+ // records offsets into main terms dict file
+ termsDictOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, numIndexTerms, false);
+
+ // records offsets into byte[] term data
+ termOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
+ } finally {
+ clone.close();
}
}
}
@Override
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
- final FieldIndexData fieldData = fields.get(fieldInfo);
- if (fieldData.coreIndex == null) {
- return null;
- } else {
- return new IndexEnum(fieldData.coreIndex);
- }
+ return new IndexEnum(fields.get(fieldInfo));
}
@Override
- public void close() throws IOException {
- if (in != null && !indexLoaded) {
- in.close();
- }
- }
+ public void close() throws IOException {}
private void seekDir(IndexInput input, long dirOffset) throws IOException {
- if (version >= FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
- input.seek(input.length() - 8);
- dirOffset = input.readLong();
- }
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
input.seek(dirOffset);
}
}
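
[Editor's note] In the reader above, the binary search in IndexEnum.seek() is widened from int to long so that more than 2^31 index terms can be addressed, and the per-field offsets now come from MonotonicBlockPackedReader, with term n's bytes occupying [termOffsets(n), termOffsets(n+1)). Below is a self-contained sketch of that search, with plain arrays standing in for MonotonicBlockPackedReader and PagedBytes, and a plain unsigned byte compare standing in for the configurable BytesRef comparator (all names are illustrative):

    import java.util.Arrays;

    final class IndexSeekSketch {
      final byte[] termBytes;    // all indexed terms, concatenated
      final long[] termOffsets;  // numIndexTerms+1 monotonic offsets into termBytes
      final long indexInterval;  // every indexInterval'th dictionary term is indexed

      IndexSeekSketch(byte[] termBytes, long[] termOffsets, long indexInterval) {
        this.termBytes = termBytes;
        this.termOffsets = termOffsets;
        this.indexInterval = indexInterval;
      }

      // Unsigned lexicographic order, standing in for the BytesRef comparator.
      static int compareUnsigned(byte[] a, byte[] b) {
        int n = Math.min(a.length, b.length);
        for (int i = 0; i < n; i++) {
          int d = (a[i] & 0xff) - (b[i] & 0xff);
          if (d != 0) return d;
        }
        return a.length - b.length;
      }

      // Returns the ord of the greatest indexed term <= target, or -1 if the
      // target sorts before every indexed term. (The real method also resolves
      // the matching terms-dict file pointer.)
      long seek(byte[] target) {
        long lo = 0;
        long hi = termOffsets.length - 2;    // ord of the last index term
        while (hi >= lo) {
          long mid = (lo + hi) >>> 1;        // lo/hi/mid stay long, as in the patch
          int off = (int) termOffsets[(int) mid];  // int casts only because plain arrays are used here
          int len = (int) (termOffsets[(int) mid + 1] - off);
          int cmp = compareUnsigned(Arrays.copyOfRange(termBytes, off, off + len), target);
          if (cmp < 0) {
            lo = mid + 1;
          } else if (cmp > 0) {
            hi = mid - 1;
          } else {
            return mid * indexInterval;      // exact hit on an indexed term
          }
        }
        return hi < 0 ? -1 : hi * indexInterval;  // floor entry
      }
    }
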
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java Sun Aug 11 12:19:13 2013
@@ -18,15 +18,16 @@ package org.apache.lucene.codecs.blockte
*/
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import java.util.List;
@@ -50,23 +51,32 @@ public class FixedGapTermsIndexWriter ex
final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
final static int VERSION_START = 0;
final static int VERSION_APPEND_ONLY = 1;
- final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
+ final static int VERSION_MONOTONIC_ADDRESSING = 2;
+ final static int VERSION_CURRENT = VERSION_MONOTONIC_ADDRESSING;
+ final static int BLOCKSIZE = 4096;
final private int termIndexInterval;
+ public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
- @SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
-
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
+ this(state, DEFAULT_TERM_INDEX_INTERVAL);
+ }
+
+ public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval) throws IOException {
+ if (termIndexInterval <= 0) {
+ throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
+ }
+ this.termIndexInterval = termIndexInterval;
final String indexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
- termIndexInterval = state.termIndexInterval;
out = state.directory.createOutput(indexFileName, state.context);
boolean success = false;
try {
- fieldInfos = state.fieldInfos;
writeHeader(out);
- out.writeInt(termIndexInterval);
+ out.writeVInt(termIndexInterval);
+ out.writeVInt(PackedInts.VERSION_CURRENT);
+ out.writeVInt(BLOCKSIZE);
success = true;
} finally {
if (!success) {
@@ -114,22 +124,25 @@ public class FixedGapTermsIndexWriter ex
long packedOffsetsStart;
private long numTerms;
- // TODO: we could conceivably make a PackedInts wrapper
- // that auto-grows... then we wouldn't force 6 bytes RAM
- // per index term:
- private short[] termLengths;
- private int[] termsPointerDeltas;
- private long lastTermsPointer;
- private long totTermLength;
+ private RAMOutputStream offsetsBuffer = new RAMOutputStream();
+ private MonotonicBlockPackedWriter termOffsets = new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);
+ private long currentOffset;
+
+ private RAMOutputStream addressBuffer = new RAMOutputStream();
+ private MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);
private final BytesRef lastTerm = new BytesRef();
SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
- termsStart = lastTermsPointer = termsFilePointer;
- termLengths = new short[0];
- termsPointerDeltas = new int[0];
+ termsStart = termsFilePointer;
+ // we write numTerms+1 offsets; term n's length is offsets[n+1] - offsets[n]
+ try {
+ termOffsets.add(0L);
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus);
+ }
}
@Override
@@ -157,21 +170,13 @@ public class FixedGapTermsIndexWriter ex
// against prior term
out.writeBytes(text.bytes, text.offset, indexedTermLength);
- if (termLengths.length == numIndexTerms) {
- termLengths = ArrayUtil.grow(termLengths);
- }
- if (termsPointerDeltas.length == numIndexTerms) {
- termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
- }
-
// save delta terms pointer
- termsPointerDeltas[numIndexTerms] = (int) (termsFilePointer - lastTermsPointer);
- lastTermsPointer = termsFilePointer;
+ termAddresses.add(termsFilePointer - termsStart);
// save term length (in bytes)
assert indexedTermLength <= Short.MAX_VALUE;
- termLengths[numIndexTerms] = (short) indexedTermLength;
- totTermLength += indexedTermLength;
+ currentOffset += indexedTermLength;
+ termOffsets.add(currentOffset);
lastTerm.copyBytes(text);
numIndexTerms++;
@@ -183,32 +188,20 @@ public class FixedGapTermsIndexWriter ex
// write primary terms dict offsets
packedIndexStart = out.getFilePointer();
- PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(termsFilePointer), PackedInts.DEFAULT);
-
// relative to our termsStart
- long upto = 0;
- for(int i=0;i<numIndexTerms;i++) {
- upto += termsPointerDeltas[i];
- w.add(upto);
- }
- w.finish();
+ termAddresses.finish();
+ addressBuffer.writeTo(out);
packedOffsetsStart = out.getFilePointer();
// write offsets into the byte[] terms
- w = PackedInts.getWriter(out, 1+numIndexTerms, PackedInts.bitsRequired(totTermLength), PackedInts.DEFAULT);
- upto = 0;
- for(int i=0;i<numIndexTerms;i++) {
- w.add(upto);
- upto += termLengths[i];
- }
- w.add(upto);
- w.finish();
+ termOffsets.finish();
+ offsetsBuffer.writeTo(out);
// our referrer holds onto us, while other fields are
// being written, so don't tie up this RAM:
- termLengths = null;
- termsPointerDeltas = null;
+ termOffsets = termAddresses = null;
+ addressBuffer = offsetsBuffer = null;
}
}
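
[Editor's note] The writer change above replaces the grow-by-hand short[]/int[] buffers with two MonotonicBlockPackedWriters streamed into RAMOutputStreams and flushed to the index file in finish(). The layout is "numTerms+1 offsets": a leading 0 is written up front, then one cumulative offset after each term, so term n's bytes occupy [offsets[n], offsets[n+1]). A runnable sketch of just that encoding, with a plain long[] standing in for MonotonicBlockPackedWriter (all names illustrative):

    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;

    final class OffsetsSketch {
      public static void main(String[] args) throws Exception {
        String[] terms = { "apple", "banana", "cherry" };

        ByteArrayOutputStream termBytes = new ByteArrayOutputStream();
        long[] offsets = new long[terms.length + 1];
        offsets[0] = 0;                    // mirrors the constructor's termOffsets.add(0L)
        long currentOffset = 0;
        for (int i = 0; i < terms.length; i++) {
          byte[] b = terms[i].getBytes(StandardCharsets.UTF_8);
          termBytes.write(b);
          currentOffset += b.length;       // mirrors currentOffset += indexedTermLength
          offsets[i + 1] = currentOffset;  // mirrors termOffsets.add(currentOffset)
        }

        // Recover term 1 ("banana") purely from the offsets:
        int start = (int) offsets[1], end = (int) offsets[2];
        System.out.println(new String(termBytes.toByteArray(), start,
                                      end - start, StandardCharsets.UTF_8));
      }
    }
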
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java Sun Aug 11 12:19:13 2013
@@ -47,8 +47,6 @@ public abstract class TermsIndexReaderBa
public abstract boolean supportsOrd();
- public abstract int getDivisor();
-
/**
* Similar to TermsEnum, except, the only "metadata" it
* reports for a given indexed term is the long fileOffset