Posted to commits@lucene.apache.org by cm...@apache.org on 2013/08/11 14:19:39 UTC

svn commit: r1512909 [5/38] - in /lucene/dev/branches/lucene4956: ./ dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/suggest/ dev-tools/idea/solr/contrib/dataimporthandler/ dev-tools/idea/solr/core/src/test/ dev-tool...

Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ja;
 
 import java.io.IOException;
 import java.io.Reader;
-import java.io.StringReader;
 import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -54,7 +53,7 @@ public class TestExtendedMode extends Ba
     int numIterations = atLeast(1000);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomUnicodeString(random(), 100);
-      TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+      TokenStream ts = analyzer.tokenStream("foo", s);
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       ts.reset();
       while (ts.incrementToken()) {
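
Nearly every test change in this commit is the same mechanical substitution: the new Analyzer.tokenStream(String field, String text) convenience overload (which wraps and reuses a Reader internally) replaces explicit StringReader construction. A minimal consumption sketch under that API — the analyzer choice, version constant, field name, and input text below are illustrative placeholders, not part of this commit:

    Analyzer analyzer = new JapaneseAnalyzer(Version.LUCENE_44);      // version constant illustrative
    TokenStream ts = analyzer.tokenStream("body", "some input text"); // no StringReader needed
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                                                       // contract: reset() before consuming
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    ts.end();
    ts.close();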

Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.ja;
  */
 
 import java.io.IOException;
-import java.io.StringReader;
 import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -151,7 +150,7 @@ public class TestJapaneseAnalyzer extend
                                             Mode.SEARCH,
                                             JapaneseAnalyzer.getDefaultStopSet(),
                                             JapaneseAnalyzer.getDefaultStopTags());
-    assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
+    assertTokenStreamContents(a.tokenStream("foo", "abcd"),
                               new String[] { "a", "b", "cd"  },
                               new int[] { 0, 1, 2 },
                               new int[] { 1, 2, 4 },

Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java Sun Aug 11 12:19:13 2013
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
-import java.io.StringReader;
 import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -142,7 +141,7 @@ public class TestJapaneseTokenizer exten
    * ideally the test would actually fail instead of hanging...
    */
   public void testDecomposition5() throws Exception {
-    TokenStream ts = analyzer.tokenStream("bogus", new StringReader("くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ"));
+    TokenStream ts = analyzer.tokenStream("bogus", "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ");
     ts.reset();
     while (ts.incrementToken()) {
       
@@ -166,8 +165,8 @@ public class TestJapaneseTokenizer exten
   /** Tests that sentence offset is incorporated into the resulting offsets */
   public void testTwoSentences() throws Exception {
     /*
-    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
-    TokenStream ts = analyzer.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+    //TokenStream ts = a.tokenStream("foo", "妹の咲子です。俺と年子で、今受験生です。");
+    TokenStream ts = analyzer.tokenStream("foo", "&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;");
     ts.reset();
     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
     while(ts.incrementToken()) {
@@ -214,7 +213,7 @@ public class TestJapaneseTokenizer exten
   public void testLargeDocReliability() throws Exception {
     for (int i = 0; i < 100; i++) {
       String s = _TestUtil.randomUnicodeString(random(), 10000);
-      TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+      TokenStream ts = analyzer.tokenStream("foo", s);
       ts.reset();
       while (ts.incrementToken()) {
       }
@@ -235,7 +234,7 @@ public class TestJapaneseTokenizer exten
         System.out.println("\nTEST: iter=" + i);
       }
       String s = _TestUtil.randomUnicodeString(random(), 100);
-      TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+      TokenStream ts = analyzer.tokenStream("foo", s);
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       ts.reset();
       while (ts.incrementToken()) {
@@ -245,14 +244,14 @@ public class TestJapaneseTokenizer exten
   }
 
   public void testOnlyPunctuation() throws IOException {
-    TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+    TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。");
     ts.reset();
     assertFalse(ts.incrementToken());
     ts.end();
   }
 
   public void testOnlyPunctuationExtended() throws IOException {
-    TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+    TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......");
     ts.reset();
     assertFalse(ts.incrementToken());
     ts.end();
@@ -261,14 +260,14 @@ public class TestJapaneseTokenizer exten
   // note: test is kinda silly since kuromoji emits punctuation tokens.
   // but, when/if we filter these out it will be useful.
   public void testEnd() throws Exception {
-    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", "これは本ではない"),
         new String[] { "これ", "は", "本", "で", "は", "ない" },
         new int[] { 0, 2, 3, 4, 5, 6 },
         new int[] { 2, 3, 4, 5, 6, 8 },
         new Integer(8)
     );
 
-    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない    ")),
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", "これは本ではない    "),
         new String[] { "これ", "は", "本", "で", "は", "ない"  },
         new int[] { 0, 2, 3, 4, 5, 6, 8 },
         new int[] { 2, 3, 4, 5, 6, 8, 9 },
@@ -279,7 +278,7 @@ public class TestJapaneseTokenizer exten
   public void testUserDict() throws Exception {
     // Not a great test because w/o userdict.txt the
     // segmentation is the same:
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+    assertTokenStreamContents(analyzer.tokenStream("foo", "関西国際空港に行った"),
                               new String[] { "関西", "国際", "空港", "に", "行っ", "た"  },
                               new int[] { 0, 2, 4, 6, 7, 9 },
                               new int[] { 2, 4, 6, 7, 9, 10 },
@@ -289,7 +288,7 @@ public class TestJapaneseTokenizer exten
 
   public void testUserDict2() throws Exception {
     // Better test: w/o userdict the segmentation is different:
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+    assertTokenStreamContents(analyzer.tokenStream("foo", "朝青龍"),
                               new String[] { "朝青龍"  },
                               new int[] { 0 },
                               new int[] { 3 },
@@ -299,7 +298,7 @@ public class TestJapaneseTokenizer exten
 
   public void testUserDict3() throws Exception {
     // Test entry that breaks into multiple tokens:
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+    assertTokenStreamContents(analyzer.tokenStream("foo", "abcd"),
                               new String[] { "a", "b", "cd"  },
                               new int[] { 0, 1, 2 },
                               new int[] { 1, 2, 4 },
@@ -315,7 +314,7 @@ public class TestJapaneseTokenizer exten
   /*
   public void testUserDict4() throws Exception {
     // Test entry that has another entry as prefix
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+    assertTokenStreamContents(analyzer.tokenStream("foo", "abcdefghij"),
                               new String[] { "ab", "cd", "efg", "hij"  },
                               new int[] { 0, 2, 4, 7 },
                               new int[] { 2, 4, 7, 10 },
@@ -366,7 +365,7 @@ public class TestJapaneseTokenizer exten
   }
 
   private void assertReadings(String input, String... readings) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    TokenStream ts = analyzer.tokenStream("ignored", input);
     ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
     ts.reset();
     for(String reading : readings) {
@@ -378,7 +377,7 @@ public class TestJapaneseTokenizer exten
   }
 
   private void assertPronunciations(String input, String... pronunciations) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    TokenStream ts = analyzer.tokenStream("ignored", input);
     ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
     ts.reset();
     for(String pronunciation : pronunciations) {
@@ -390,7 +389,7 @@ public class TestJapaneseTokenizer exten
   }
   
   private void assertBaseForms(String input, String... baseForms) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    TokenStream ts = analyzer.tokenStream("ignored", input);
     BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
     ts.reset();
     for(String baseForm : baseForms) {
@@ -402,7 +401,7 @@ public class TestJapaneseTokenizer exten
   }
 
   private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    TokenStream ts = analyzer.tokenStream("ignored", input);
     InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
     ts.reset();
     for(String inflectionType : inflectionTypes) {
@@ -414,7 +413,7 @@ public class TestJapaneseTokenizer exten
   }
 
   private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    TokenStream ts = analyzer.tokenStream("ignored", input);
     InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
     ts.reset();
     for(String inflectionForm : inflectionForms) {
@@ -426,7 +425,7 @@ public class TestJapaneseTokenizer exten
   }
   
   private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    TokenStream ts = analyzer.tokenStream("ignored", input);
     PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
     ts.reset();
     for(String partOfSpeech : partsOfSpeech) {
@@ -619,7 +618,7 @@ public class TestJapaneseTokenizer exten
     if (numIterations > 1) {
       // warmup
       for (int i = 0; i < numIterations; i++) {
-        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+        final TokenStream ts = analyzer.tokenStream("ignored", line);
         ts.reset();
         while(ts.incrementToken());
       }
@@ -628,7 +627,7 @@ public class TestJapaneseTokenizer exten
 
     long totalStart = System.currentTimeMillis();
     for (int i = 0; i < numIterations; i++) {
-      final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+      final TokenStream ts = analyzer.tokenStream("ignored", line);
       ts.reset();
       while(ts.incrementToken());
     }
@@ -640,7 +639,7 @@ public class TestJapaneseTokenizer exten
     totalStart = System.currentTimeMillis();
     for (int i = 0; i < numIterations; i++) {
       for (String sentence: sentences) {
-        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+        final TokenStream ts = analyzer.tokenStream("ignored", sentence);
         ts.reset();
         while(ts.incrementToken());
       }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java Sun Aug 11 12:19:13 2013
@@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder 
     
     System.out.println("  encode...");
 
-    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
+    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
     Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
     IntsRef scratch = new IntsRef();
     long ord = -1; // first ord will be 0
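
PositiveIntOutputs.getSingleton() lost its boolean doShare parameter; output sharing is now unconditional. A hedged sketch of the surrounding FST-building API — keys and values are illustrative, and BYTE1 input is used for simplicity where the dictionary builder above uses BYTE2:

    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();  // sharing is implicit now
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRef scratch = new IntsRef();
    builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);   // inputs must arrive in sorted order
    builder.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
    FST<Long> fst = builder.finish();
    Long value = Util.get(fst, new BytesRef("dog"));                 // -> 7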

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml Sun Aug 11 12:19:13 2013
@@ -19,9 +19,9 @@
 <ivy-module version="2.0">
     <info organisation="org.apache.lucene" module="analyzers-morfologik"/>
     <dependencies>
-      <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.5" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.5" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.5" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
       <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
     </dependencies>
 </ivy-module>

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -26,38 +26,21 @@ import org.apache.lucene.analysis.standa
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
 
-import morfologik.stemming.PolishStemmer.DICTIONARY;
-
 /**
  * {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
  * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
  */
 public class MorfologikAnalyzer extends Analyzer {
-
-  private final DICTIONARY dictionary;
   private final Version version;
 
   /**
-   * Builds an analyzer for a given PolishStemmer.DICTIONARY enum.
-   * 
-   * @param vers
-   *          lucene compatibility version
-   * @param dict
-   *          A constant specifying which dictionary to choose. See the
-   *          Morfologik documentation for details or use the default.
-   */
-  public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
-    this.version = vers;
-    this.dictionary = dict;
-  }
-
-  /**
-   * Builds an analyzer for an original MORFOLOGIK dictionary.
+   * Builds an analyzer with the default Morfologik dictionary (polimorf).
    * 
-   * @param vers         lucene compatibility version
+   * @param version
+   *          Lucene compatibility version
    */
-  public MorfologikAnalyzer(final Version vers) {
-    this(vers, DICTIONARY.MORFOLOGIK);
+  public MorfologikAnalyzer(final Version version) {
+    this.version = version;
   }
 
   /**
@@ -78,7 +61,7 @@ public class MorfologikAnalyzer extends 
     final Tokenizer src = new StandardTokenizer(this.version, reader);
     
     return new TokenStreamComponents(
-      src,
-      new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
+        src, 
+        new MorfologikFilter(new StandardFilter(this.version, src), this.version));
   }
 }
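
Since Morfologik 1.7.x bundles a single Polish dictionary, the DICTIONARY-selecting constructor is gone and only the Lucene compatibility version remains. A minimal usage sketch (version constant and field name illustrative):

    Analyzer analyzer = new MorfologikAnalyzer(Version.LUCENE_44);
    TokenStream ts = analyzer.tokenStream("body", "liście danych");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // lemmas, possibly several per surface form
    }
    ts.end();
    ts.close();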

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Sun Aug 11 12:19:13 2013
@@ -20,22 +20,24 @@ package org.apache.lucene.analysis.morfo
 
 import java.io.IOException;
 import java.util.*;
+import java.util.regex.Pattern;
 
 import morfologik.stemming.*;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.*;
 
 /**
- * {@link TokenFilter} using Morfologik library.
+ * {@link TokenFilter} using the Morfologik library to transform input tokens into lemma and
+ * morphosyntactic (POS) tokens. Applies to Polish only.
  *
- * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
- * annotations for produced lemmas. See the Morfologik documentation for details.
+ * <p>MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
+ * annotations for produced lemmas. See the Morfologik documentation for details.</p>
  * 
  * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
  */
@@ -44,6 +46,7 @@ public class MorfologikFilter extends To
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   private final CharsRef scratch = new CharsRef(0);
   private final CharacterUtils charUtils;
@@ -58,13 +61,11 @@ public class MorfologikFilter extends To
   private int lemmaListIndex;
 
   /**
-   * Builds a filter for given PolishStemmer.DICTIONARY enum.
-   * 
+   * Creates a new MorfologikFilter.
    * @param in   input token stream
-   * @param dict PolishStemmer.DICTIONARY enum
    * @param version Lucene version compatibility for lowercasing.
    */
-  public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
+  public MorfologikFilter(final TokenStream in, final Version version) {
     super(in);
     this.input = in;
     
@@ -73,7 +74,7 @@ public class MorfologikFilter extends To
     ClassLoader cl = me.getContextClassLoader();
     try {
       me.setContextClassLoader(PolishStemmer.class.getClassLoader());
-      this.stemmer = new PolishStemmer(dict);
+      this.stemmer = new PolishStemmer();
       this.charUtils = CharacterUtils.getInstance(version);
       this.lemmaList = Collections.emptyList();
     } finally {
@@ -81,44 +82,30 @@ public class MorfologikFilter extends To
     }  
   }
 
+  /**
+   * A pattern used to split lemma forms.
+   */
+  private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
+
   private void popNextLemma() {
-    // Collect all tags for the next unique lemma.
-    CharSequence currentStem;
-    int tags = 0;
-    do {
-      final WordData lemma = lemmaList.get(lemmaListIndex++);
-      currentStem = lemma.getStem();
-      final CharSequence tag = lemma.getTag();
-      if (tag != null) {
-        if (tagsList.size() <= tags) {
+    // One tag (concatenated) per lemma.
+    final WordData lemma = lemmaList.get(lemmaListIndex++);
+    termAtt.setEmpty().append(lemma.getStem());
+    CharSequence tag = lemma.getTag();
+    if (tag != null) {
+      String[] tags = lemmaSplitter.split(tag.toString());
+      for (int i = 0; i < tags.length; i++) {
+        if (tagsList.size() <= i) {
           tagsList.add(new StringBuilder());
         }
-
-        final StringBuilder buffer = tagsList.get(tags++);  
+        StringBuilder buffer = tagsList.get(i);
         buffer.setLength(0);
-        buffer.append(lemma.getTag());
-      }
-    } while (lemmaListIndex < lemmaList.size() &&
-             equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
-
-    // Set the lemma's base form and tags as attributes.
-    termAtt.setEmpty().append(currentStem);
-    tagsAtt.setTags(tagsList.subList(0, tags));
-  }
-
-  /**
-   * Compare two char sequences for equality. Assumes non-null arguments. 
-   */
-  private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
-    int len1 = s1.length();
-    int len2 = s2.length();
-    if (len1 != len2) return false;
-    for (int i = len1; --i >= 0;) {
-      if (s1.charAt(i) != s2.charAt(i)) { 
-        return false; 
+        buffer.append(tags[i]);
       }
+      tagsAtt.setTags(tagsList.subList(0, tags.length));
+    } else {
+      tagsAtt.setTags(Collections.<StringBuilder> emptyList());
     }
-    return true;
   }
 
   /**
@@ -140,7 +127,8 @@ public class MorfologikFilter extends To
       popNextLemma();
       return true;
     } else if (this.input.incrementToken()) {
-      if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+      if (!keywordAttr.isKeyword() && 
+          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
         current = captureState();
         popNextLemma();
       } else {
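
The filter now also consults KeywordAttribute, so terms marked as keywords bypass lemmatization entirely. A sketch of a chain that protects a chosen surface form, mirroring the testKeywordAttrTokens test added later in this commit (the protected word and the version/reader variables are illustrative):

    CharArraySet keywords = new CharArraySet(version, 1, false);
    keywords.add("liście");                                  // pass through unstemmed

    Tokenizer src = new StandardTokenizer(version, reader);
    TokenStream result = new StandardFilter(version, src);
    result = new SetKeywordMarkerFilter(result, keywords);   // sets KeywordAttribute on matches
    result = new MorfologikFilter(result, version);          // skips keyword tokens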

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -17,12 +17,8 @@ package org.apache.lucene.analysis.morfo
  * limitations under the License.
  */
 
-import java.util.Arrays;
-import java.util.Locale;
 import java.util.Map;
 
-import morfologik.stemming.PolishStemmer.DICTIONARY;
-
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
@@ -32,39 +28,28 @@ import org.apache.lucene.analysis.util.T
  * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
  *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
- *     &lt;filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" /&gt;
+ *     &lt;filter class="solr.MorfologikFilterFactory" /&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  * 
- * <p>Any of Morfologik dictionaries can be used, these are at the moment:
- * <code>MORFOLOGIK</code> (Morfologik's original dictionary),
- * <code>MORFEUSZ</code> (Morfeusz-SIAT),
- * <code>COMBINED</code> (both of the dictionaries above, combined).
- * 
  * @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
  */
 public class MorfologikFilterFactory extends TokenFilterFactory {
-  /** Dictionary. */
-  private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK;
-  
   /** Schema attribute. */
+  @Deprecated
   public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
-  
+
   /** Creates a new MorfologikFilterFactory */
   public MorfologikFilterFactory(Map<String,String> args) {
     super(args);
+
+    // Be specific about the no-longer-supported dictionary attribute.
     String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
     if (dictionaryName != null && !dictionaryName.isEmpty()) {
-      try {
-        DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT));
-        assert dictionary != null;
-        this.dictionary = dictionary;
-      } catch (IllegalArgumentException e) {
-        throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the "
-            + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: "  
-            + dictionaryName);
-      }
+      throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
+          + "longer supported (Morfologik has one dictionary): " + dictionaryName);
     }
+
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -72,6 +57,6 @@ public class MorfologikFilterFactory ext
 
   @Override
   public TokenStream create(TokenStream ts) {
-    return new MorfologikFilter(ts, dictionary, luceneMatchVersion);
+    return new MorfologikFilter(ts, luceneMatchVersion);
   }
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java Sun Aug 11 12:19:13 2013
@@ -23,9 +23,9 @@ import java.util.List;
 import org.apache.lucene.util.Attribute;
 
 /** 
- * Morfologik dictionaries provide morphosyntactic annotations for
+ * Morfologik provides morphosyntactic annotations for
  * surface forms. For the exact format and description of these,
- * see the project's documentation (annotations vary by dictionary!).
+ * see the project's documentation.
  */
 public interface MorphosyntacticTagsAttribute extends Attribute {
   /** 
@@ -36,7 +36,9 @@ public interface MorphosyntacticTagsAttr
   public void setTags(List<StringBuilder> tags);
 
   /** 
-   * Returns the POS tag of the term.
+   * Returns the POS tag of the term. A single word may have multiple POS tags, 
+   * depending on the interpretation (context disambiguation is typically needed
+   * to determine which particular tag is appropriate).  
    */
   public List<StringBuilder> getTags();
 

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -18,11 +18,19 @@ package org.apache.lucene.analysis.morfo
  */
 
 import java.io.IOException;
-import java.io.StringReader;
+import java.io.Reader;
 import java.util.TreeSet;
 
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * TODO: The tests below rely on the order of returned lemmas, which is probably not good. 
@@ -56,16 +64,28 @@ public class TestMorfologikAnalyzer exte
     assertAnalyzesToReuse(
         a,
         "T. Gl\u00FCcksberg",
-        new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
-        new int[] { 0, 0, 0, 3  },
-        new int[] { 1, 1, 1, 13 },
-        new int[] { 1, 0, 0, 1  });
+        new String[] { "tom", "tona", "Gl\u00FCcksberg" },
+        new int[] { 0, 0, 3  },
+        new int[] { 1, 1, 13 },
+        new int[] { 1, 0, 1  });
+  }
+
+  @SuppressWarnings("unused")
+  private void dumpTokens(String input) throws IOException {
+    TokenStream ts = getTestAnalyzer().tokenStream("dummy", input);
+    ts.reset();
+
+    MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
+    CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
+    while (ts.incrementToken()) {
+      System.out.println(charTerm.toString() + " => " + attribute.getTags());
+    }
   }
 
   /** Test reuse of MorfologikFilter with leftover stems. */
   public final void testLeftoverStems() throws IOException {
     Analyzer a = getTestAnalyzer();
-    TokenStream ts_1 = a.tokenStream("dummy", new StringReader("liście"));
+    TokenStream ts_1 = a.tokenStream("dummy", "liście");
     CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
     ts_1.reset();
     ts_1.incrementToken();
@@ -73,7 +93,7 @@ public class TestMorfologikAnalyzer exte
     ts_1.end();
     ts_1.close();
 
-    TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
+    TokenStream ts_2 = a.tokenStream("dummy", "danych");
     CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
     ts_2.reset();
     ts_2.incrementToken();
@@ -120,7 +140,7 @@ public class TestMorfologikAnalyzer exte
 
   /** Test morphosyntactic annotations. */
   public final void testPOSAttribute() throws IOException {
-    TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
+    TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście");
 
     ts.reset();
     assertPOSToken(ts, "liście",  
@@ -144,6 +164,34 @@ public class TestMorfologikAnalyzer exte
     ts.close();
   }
 
+  /** */
+  public final void testKeywordAttrTokens() throws IOException {
+    final Version version = TEST_VERSION_CURRENT;
+
+    Analyzer a = new MorfologikAnalyzer(version) {
+      @Override
+      protected TokenStreamComponents createComponents(String field, Reader reader) {
+        final CharArraySet keywords = new CharArraySet(version, 1, false);
+        keywords.add("liście");
+
+        final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+        result = new SetKeywordMarkerFilter(result, keywords);
+        result = new MorfologikFilter(result, TEST_VERSION_CURRENT); 
+
+        return new TokenStreamComponents(src, result);
+      }
+    };
+
+    assertAnalyzesToReuse(
+      a,
+      "liście danych",
+      new String[] { "liście", "dany", "dana", "dane", "dać" },
+      new int[] { 0, 7, 7, 7, 7 },
+      new int[] { 6, 13, 13, 13, 13 },
+      new int[] { 1, 1, 0, 0, 0 });
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandom() throws Exception {
     checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER); 

Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.morfo
  */
 
 import java.io.StringReader;
+import java.util.Collections;
 import java.util.HashMap;
-import java.util.Map;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
@@ -31,10 +31,7 @@ import org.apache.lucene.analysis.TokenS
 public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
   public void testCreateDictionary() throws Exception {
     StringReader reader = new StringReader("rowery bilety");
-    Map<String,String> initParams = new HashMap<String,String>();
-    initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE,
-        "morfologik");
-    MorfologikFilterFactory factory = new MorfologikFilterFactory(initParams);
+    MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
     TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
     stream = factory.create(stream);
     assertTokenStreamContents(stream, new String[] {"rower", "bilet"});

Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Sun Aug 11 12:19:13 2013
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.cn.sm
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.util.Properties;
 
+import org.apache.lucene.util.IOUtils;
+
 /**
  * Manages analysis data configuration for SmartChineseAnalyzer
  * <p>
@@ -77,13 +80,13 @@ public class AnalyzerProfile {
     Properties prop = new Properties();
     try {
       FileInputStream input = new FileInputStream(propFile);
-      prop.load(input);
+      prop.load(new InputStreamReader(input, IOUtils.CHARSET_UTF_8));
       String dir = prop.getProperty("analysis.data.dir", "");
       input.close();
       return dir;
     } catch (IOException e) {
+      return "";
     }
-    return "";
   }
 
 }
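
Properties.load(InputStream) always decodes ISO-8859-1, silently mangling UTF-8 data files; loading through a Reader makes the charset explicit. A sketch of the pattern (the Config.java change later in this commit sidesteps the same pitfall with a StringReader); the try/finally here is an illustrative tightening, not the committed code:

    Properties prop = new Properties();
    InputStream input = new FileInputStream(propFile);
    try {
      prop.load(new InputStreamReader(input, IOUtils.CHARSET_UTF_8)); // explicit UTF-8
    } finally {
      input.close();
    }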

Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cn.sm
 
 import java.io.IOException;
 import java.io.Reader;
-import java.io.StringReader;
 import java.util.Random;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -185,7 +184,7 @@ public class TestSmartChineseAnalyzer ex
       sb.append("我购买了道具和服装。");
     }
     Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
-    TokenStream stream = analyzer.tokenStream("", new StringReader(sb.toString()));
+    TokenStream stream = analyzer.tokenStream("", sb.toString());
     stream.reset();
     while (stream.incrementToken()) {
     }
@@ -198,7 +197,7 @@ public class TestSmartChineseAnalyzer ex
       sb.append("我购买了道具和服装");
     }
     Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
-    TokenStream stream = analyzer.tokenStream("", new StringReader(sb.toString()));
+    TokenStream stream = analyzer.tokenStream("", sb.toString());
     stream.reset();
     while (stream.incrementToken()) {
     }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java Sun Aug 11 12:19:13 2013
@@ -35,7 +35,6 @@ import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -62,7 +61,7 @@ public class UIMABaseAnalyzerTest extend
 
   @Test
   public void baseUIMAAnalyzerStreamTest() throws Exception {
-    TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
+    TokenStream ts = analyzer.tokenStream("text", "the big brown fox jumped on the wood");
     assertTokenStreamContents(ts, new String[]{"the", "big", "brown", "fox", "jumped", "on", "the", "wood"});
   }
 

Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java Sun Aug 11 12:19:13 2013
@@ -23,8 +23,6 @@ import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.StringReader;
-
 /**
  * Testcase for {@link UIMATypeAwareAnalyzer}
  */
@@ -51,7 +49,7 @@ public class UIMATypeAwareAnalyzerTest e
   public void baseUIMATypeAwareAnalyzerStreamTest() throws Exception {
 
     // create a token stream
-    TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
+    TokenStream ts = analyzer.tokenStream("text", "the big brown fox jumped on the wood");
 
     // check that 'the big brown fox jumped on the wood' tokens have the expected PoS types
     assertTokenStreamContents(ts,

Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Sun Aug 11 12:19:13 2013
@@ -119,14 +119,9 @@ public class CreateIndexTask extends Per
       
       if (mergeScheduler.equals("org.apache.lucene.index.ConcurrentMergeScheduler")) {
         ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwConf.getMergeScheduler();
-        int v = config.get("concurrent.merge.scheduler.max.thread.count", -1);
-        if (v != -1) {
-          cms.setMaxThreadCount(v);
-        }
-        v = config.get("concurrent.merge.scheduler.max.merge.count", -1);
-        if (v != -1) {
-          cms.setMaxMergeCount(v);
-        }
+        int maxThreadCount = config.get("concurrent.merge.scheduler.max.thread.count", ConcurrentMergeScheduler.DEFAULT_MAX_THREAD_COUNT);
+        int maxMergeCount = config.get("concurrent.merge.scheduler.max.merge.count", ConcurrentMergeScheduler.DEFAULT_MAX_MERGE_COUNT);
+        cms.setMaxMergesAndThreads(maxMergeCount, maxThreadCount);
       }
     }
 
@@ -151,13 +146,10 @@ public class CreateIndexTask extends Per
       } catch (Exception e) {
         throw new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy", e);
       }
+      iwConf.getMergePolicy().setNoCFSRatio(isCompound ? 1.0 : 0.0);
       if(iwConf.getMergePolicy() instanceof LogMergePolicy) {
         LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
-        logMergePolicy.setUseCompoundFile(isCompound);
         logMergePolicy.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
-      } else if(iwConf.getMergePolicy() instanceof TieredMergePolicy) {
-        TieredMergePolicy tieredMergePolicy = (TieredMergePolicy) iwConf.getMergePolicy();
-        tieredMergePolicy.setUseCompoundFile(isCompound);
       }
     }
     final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
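
Two API shifts here: ConcurrentMergeScheduler's limits are now set together through setMaxMergesAndThreads (presumably to avoid the order-sensitive validation of the individual setters), and compound-file behavior moved from per-policy setUseCompoundFile flags up to MergePolicy.setNoCFSRatio, where 1.0 means "always write compound files" and 0.0 means "never". A sketch against those signatures (variable names illustrative):

    ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwConf.getMergeScheduler();
    cms.setMaxMergesAndThreads(maxMergeCount, maxThreadCount);      // one atomic call

    iwConf.getMergePolicy().setNoCFSRatio(useCompound ? 1.0 : 0.0); // applies to any MergePolicy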

Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java Sun Aug 11 12:19:13 2013
@@ -18,9 +18,9 @@ package org.apache.lucene.benchmark.byTa
  */
 
 import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -80,8 +80,7 @@ public class Config {
     }
     // read props from string
     this.props = new Properties();
-    // props.load always assumes iso8859-1...
-    props.load(new ByteArrayInputStream(sb.toString().getBytes("ISO-8859-1")));
+    props.load(new StringReader(sb.toString()));
 
     // make sure work dir is set properly 
     if (props.get("work.dir") == null) {

Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Sun Aug 11 12:19:13 2013
@@ -21,7 +21,6 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
-import java.io.StringReader;
 import java.text.Collator;
 import java.util.List;
 import java.util.Locale;
@@ -49,6 +48,7 @@ import org.apache.lucene.index.IndexWrit
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogDocMergePolicy;
 import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.SerialMergeScheduler;
@@ -754,7 +754,7 @@ public class TestPerfTasksLogic extends 
     assertEquals(2, writer.getConfig().getMaxBufferedDocs());
     assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
     assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
-    assertFalse(((LogMergePolicy) writer.getConfig().getMergePolicy()).getUseCompoundFile());
+    assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
     writer.close();
     Directory dir = benchmark.getRunData().getDirectory();
     IndexReader reader = DirectoryReader.open(dir);
@@ -978,8 +978,8 @@ public class TestPerfTasksLogic extends 
   
   private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
       throws Exception {
-    TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
-    TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
+    TokenStream ts1 = a1.tokenStream("bogus", text);
+    TokenStream ts2 = a2.tokenStream("bogus", text);
     ts1.reset();
     ts2.reset();
     TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
@@ -1029,7 +1029,7 @@ public class TestPerfTasksLogic extends 
     Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
         ("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
     benchmark.getRunData().getAnalyzer().tokenStream
-        ("bogus", new StringReader(text)).close();
+        ("bogus", text).close();
     BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
                                              new String[] { "one", "one two", "two", "two three",
                                                             "three", "three four", "four", "four five",

Modified: lucene/dev/branches/lucene4956/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/build.xml?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/build.xml Sun Aug 11 12:19:13 2013
@@ -183,7 +183,9 @@
     <forbidden-apis internalRuntimeForbidden="true" classpathref="forbidden-apis.classpath">
       <bundledSignatures name="jdk-unsafe-${javac.target}"/>
       <bundledSignatures name="jdk-deprecated-${javac.target}"/>
-      <signaturesFileSet file="${common.dir}/tools/forbiddenApis/executors.txt"/>
+      <signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
+        <include name="base.txt" />
+      </signaturesFileSet>
       <fileset dir="${basedir}/build" includes="**/*.class" />
     </forbidden-apis>
   </target>
@@ -345,7 +347,7 @@
   </target>
 
   <!-- rat-sources-typedef is *not* a useless dependency. do not remove -->
-  <target name="rat-sources" depends="rat-sources-typedef">
+  <target name="rat-sources" depends="rat-sources-typedef,common.rat-sources">
     <subant target="rat-sources" failonerror="true" inheritall="false">
       <propertyset refid="uptodate.and.compiled.properties"/>
       <fileset dir="core" includes="build.xml"/>
@@ -608,4 +610,13 @@
     <jar-checksum-macro srcdir="${common.dir}" dstdir="${common.dir}/licenses"/>
   </target>
 
+  <target name="regenerate">
+    <subant target="regenerate" failonerror="true" inheritall="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+      <fileset dir="core" includes="build.xml"/>
+      <fileset dir="test-framework" includes="build.xml"/>
+    </subant>
+    <modules-crawl target="regenerate"/>
+  </target>
+
 </project>

Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Sun Aug 11 12:19:13 2013
@@ -33,7 +33,6 @@ import org.apache.lucene.search.Wildcard
 import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
-import java.io.StringReader;
 import java.util.Collection;
 import java.util.LinkedList;
 
@@ -86,7 +85,7 @@ public class SimpleNaiveBayesClassifier 
 
   private String[] tokenizeDoc(String doc) throws IOException {
     Collection<String> result = new LinkedList<String>();
-    TokenStream tokenStream = analyzer.tokenStream(textFieldName, new StringReader(doc));
+    TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
     CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
     tokenStream.reset();
     while (tokenStream.incrementToken()) {

Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java Sun Aug 11 12:19:13 2013
@@ -69,9 +69,6 @@ public class BlockTermsReader extends Fi
 
   private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
 
-  // Caches the most recently looked-up field + terms:
-  private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
-
   // Reads the terms index
   private TermsIndexReaderBase indexReader;
 
@@ -113,11 +110,10 @@ public class BlockTermsReader extends Fi
   // private String segment;
   
   public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, SegmentInfo info, PostingsReaderBase postingsReader, IOContext context,
-                          int termsCacheSize, String segmentSuffix)
+                          String segmentSuffix)
     throws IOException {
     
     this.postingsReader = postingsReader;
-    termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
 
     // this.segment = segment;
     in = dir.openInput(IndexFileNames.segmentFileName(info.name, segmentSuffix, BlockTermsWriter.TERMS_EXTENSION),
@@ -317,11 +313,6 @@ public class BlockTermsReader extends Fi
          calls next() (which is not "typical"), then we'll do the real seek */
       private boolean seekPending;
 
-      /* How many blocks we've read since last seek.  Once this
-         is >= indexEnum.getDivisor() we set indexIsCurrent to false (since
-         the index can no long bracket seek-within-block). */
-      private int blocksSinceSeek;
-
       private byte[] termSuffixes;
       private ByteArrayDataInput termSuffixesReader = new ByteArrayDataInput();
 
@@ -362,13 +353,13 @@ public class BlockTermsReader extends Fi
       // return NOT_FOUND so it's a waste for us to fill in
       // the term that was actually NOT_FOUND
       @Override
-      public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
+      public SeekStatus seekCeil(final BytesRef target) throws IOException {
 
         if (indexEnum == null) {
           throw new IllegalStateException("terms index was not loaded");
         }
    
-        //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
+        //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
         if (didIndexNext) {
           if (nextIndexTerm == null) {
             //System.out.println("  nextIndexTerm=null");
@@ -377,23 +368,6 @@ public class BlockTermsReader extends Fi
           }
         }
 
-        // Check cache
-        if (useCache) {
-          fieldTerm.term = target;
-          // TODO: should we differentiate "frozen"
-          // TermState (ie one that was cloned and
-          // cached/returned by termState()) from the
-          // malleable (primary) one?
-          final TermState cachedState = termsCache.get(fieldTerm);
-          if (cachedState != null) {
-            seekPending = true;
-            //System.out.println("  cached!");
-            seekExact(target, cachedState);
-            //System.out.println("  term=" + term.utf8ToString());
-            return SeekStatus.FOUND;
-          }
-        }
-
         boolean doSeek = true;
 
         // See if we can avoid seeking, because target term
@@ -441,8 +415,7 @@ public class BlockTermsReader extends Fi
           assert result;
 
           indexIsCurrent = true;
-          didIndexNext = false;
-          blocksSinceSeek = 0;          
+          didIndexNext = false;
 
           if (doOrd) {
             state.ord = indexEnum.ord()-1;
@@ -574,14 +547,6 @@ public class BlockTermsReader extends Fi
                 // Done!  Exact match.  Stop here, fill in
                 // real term, return FOUND.
                 //System.out.println("  FOUND");
-
-                if (useCache) {
-                  // Store in cache
-                  decodeMetaData();
-                  //System.out.println("  cache! state=" + state);
-                  termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
-                }
-
                 return SeekStatus.FOUND;
               } else {
                 //System.out.println("  NOT_FOUND");
@@ -758,7 +723,6 @@ public class BlockTermsReader extends Fi
 
         indexIsCurrent = true;
         didIndexNext = false;
-        blocksSinceSeek = 0;
         seekPending = false;
 
         state.ord = indexEnum.ord()-1;
@@ -831,8 +795,7 @@ public class BlockTermsReader extends Fi
 
         postingsReader.readTermsBlock(in, fieldInfo, state);
 
-        blocksSinceSeek++;
-        indexIsCurrent = indexIsCurrent && (blocksSinceSeek < indexReader.getDivisor());
+        indexIsCurrent = false;
         //System.out.println("  indexIsCurrent=" + indexIsCurrent);
 
         return true;
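
The hunks above delete the DoubleBarrelLRUCache terms cache and the useCache
flag on seekCeil, and retire the blocksSinceSeek/divisor bookkeeping: reading
past the current block now simply sets indexIsCurrent to false. A minimal
caller-side sketch of the cache-free signature follows; the helper class,
method name, and the assumption that a Terms instance is obtained elsewhere
are illustrative, not part of this commit:

    import java.io.IOException;

    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    // Hedged sketch: drives the seekCeil(BytesRef) signature shown above.
    // A real caller would obtain Terms from AtomicReader.terms(field).
    class SeekCeilSketch {
      static boolean containsTerm(Terms terms, String text) throws IOException {
        TermsEnum te = terms.iterator(null); // Lucene 4.x reuse-style iterator
        // No useCache argument anymore: every seek consults the terms index,
        // then scans within the target block.
        return te.seekCeil(new BytesRef(text)) == TermsEnum.SeekStatus.FOUND;
      }
    }

Dropping the cache trades repeated-seek hits for simpler state handling;
decodeMetaData() now runs only when a caller actually needs the term's
metadata, not eagerly to populate a cache entry.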

Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java Sun Aug 11 12:19:13 2013
@@ -27,7 +27,7 @@ import org.apache.lucene.index.FieldInfo
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.PagedBytes;
-import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
 
 import java.util.HashMap;
 import java.util.Comparator;
@@ -43,21 +43,15 @@ import org.apache.lucene.index.IndexFile
  */
 public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
 
-  // NOTE: long is overkill here, since this number is 128
-  // by default and only indexDivisor * 128 if you change
-  // the indexDivisor at search time.  But, we use this in a
+  // NOTE: long is overkill here, but we use this in a
   // number of places to multiply out the actual ord, and we
   // will overflow int during those multiplies.  So to avoid
   // having to upgrade each multiply to long in multiple
   // places (error prone), we use long here:
-  private long totalIndexInterval;
-
-  private int indexDivisor;
-  final private int indexInterval;
-
-  // Closed if indexLoaded is true:
-  private IndexInput in;
-  private volatile boolean indexLoaded;
+  private final long indexInterval;
+  
+  private final int packedIntsVersion;
+  private final int blocksize;
 
   private final Comparator<BytesRef> termComp;
 
@@ -72,35 +66,24 @@ public class FixedGapTermsIndexReader ex
   // start of the field info data
   private long dirOffset;
   
-  private final int version;
-
-  public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
+  public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
     throws IOException {
 
     this.termComp = termComp;
-
-    assert indexDivisor == -1 || indexDivisor > 0;
-
-    in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);
+    
+    final IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);
     
     boolean success = false;
 
     try {
       
-      version = readHeader(in);
-      indexInterval = in.readInt();
+      readHeader(in);
+      indexInterval = in.readVInt();
       if (indexInterval < 1) {
         throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
       }
-      this.indexDivisor = indexDivisor;
-
-      if (indexDivisor < 0) {
-        totalIndexInterval = indexInterval;
-      } else {
-        // In case terms index gets loaded, later, on demand
-        totalIndexInterval = indexInterval * indexDivisor;
-      }
-      assert totalIndexInterval > 0;
+      packedIntsVersion = in.readVInt();
+      blocksize = in.readVInt();
       
       seekDir(in, dirOffset);
 
@@ -112,7 +95,7 @@ public class FixedGapTermsIndexReader ex
       //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
       for(int i=0;i<numFields;i++) {
         final int field = in.readVInt();
-        final int numIndexTerms = in.readVInt();
+        final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
         if (numIndexTerms < 0) {
           throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
         }
@@ -124,47 +107,33 @@ public class FixedGapTermsIndexReader ex
          throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + " numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
         }
         final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-        FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
+        FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(in, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
         if (previous != null) {
           throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
         }
       }
       success = true;
     } finally {
-      if (!success) {
+      if (success) {
+        IOUtils.close(in);
+      } else {
         IOUtils.closeWhileHandlingException(in);
       }
-      if (indexDivisor > 0) {
-        in.close();
-        in = null;
-        if (success) {
-          indexLoaded = true;
-        }
-        termBytesReader = termBytes.freeze(true);
-      }
+      termBytesReader = termBytes.freeze(true);
     }
   }
-  
-  @Override
-  public int getDivisor() {
-    return indexDivisor;
-  }
 
-  private int readHeader(IndexInput input) throws IOException {
-    int version = CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
-      FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_CURRENT);
-    if (version < FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
-      dirOffset = input.readLong();
-    }
-    return version;
+  private void readHeader(IndexInput input) throws IOException {
+    CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
+      FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT);
   }
 
   private class IndexEnum extends FieldIndexEnum {
-    private final FieldIndexData.CoreFieldIndex fieldIndex;
+    private final FieldIndexData fieldIndex;
     private final BytesRef term = new BytesRef();
     private long ord;
 
-    public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
+    public IndexEnum(FieldIndexData fieldIndex) {
       this.fieldIndex = fieldIndex;
     }
 
@@ -175,12 +144,11 @@ public class FixedGapTermsIndexReader ex
 
     @Override
     public long seek(BytesRef target) {
-      int lo = 0;          // binary search
-      int hi = fieldIndex.numIndexTerms - 1;
-      assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
+      long lo = 0;          // binary search
+      long hi = fieldIndex.numIndexTerms - 1;
 
       while (hi >= lo) {
-        int mid = (lo + hi) >>> 1;
+        long mid = (lo + hi) >>> 1;
 
         final long offset = fieldIndex.termOffsets.get(mid);
         final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
@@ -193,7 +161,7 @@ public class FixedGapTermsIndexReader ex
           lo = mid + 1;
         } else {
           assert mid >= 0;
-          ord = mid*totalIndexInterval;
+          ord = mid*indexInterval;
           return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
         }
       }
@@ -207,17 +175,17 @@ public class FixedGapTermsIndexReader ex
       final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
       termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
 
-      ord = hi*totalIndexInterval;
+      ord = hi*indexInterval;
       return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
     }
 
     @Override
     public long next() {
-      final int idx = 1 + (int) (ord / totalIndexInterval);
+      final long idx = 1 + (ord / indexInterval);
       if (idx >= fieldIndex.numIndexTerms) {
         return -1;
       }
-      ord += totalIndexInterval;
+      ord += indexInterval;
 
       final long offset = fieldIndex.termOffsets.get(idx);
       final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
@@ -232,13 +200,13 @@ public class FixedGapTermsIndexReader ex
 
     @Override
     public long seek(long ord) {
-      int idx = (int) (ord / totalIndexInterval);
+      long idx = ord / indexInterval;
       // caller must ensure ord is in bounds
       assert idx < fieldIndex.numIndexTerms;
       final long offset = fieldIndex.termOffsets.get(idx);
       final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
       termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
-      this.ord = idx * totalIndexInterval;
+      this.ord = idx * indexInterval;
       return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
     }
   }
@@ -249,176 +217,58 @@ public class FixedGapTermsIndexReader ex
   }
 
   private final class FieldIndexData {
-
-    volatile CoreFieldIndex coreIndex;
-
-    private final long indexStart;
-    private final long termsStart;
-    private final long packedIndexStart;
-    private final long packedOffsetsStart;
-
-    private final int numIndexTerms;
-
-    public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
-                          long packedOffsetsStart) throws IOException {
-
+    // where this field's terms begin in the packed byte[]
+    // data
+    final long termBytesStart;
+    
+    // offset into index termBytes
+    final MonotonicBlockPackedReader termOffsets;
+    
+    // index pointers into main terms dict
+    final MonotonicBlockPackedReader termsDictOffsets;
+    
+    final long numIndexTerms;
+    final long termsStart;
+    
+    public FieldIndexData(IndexInput in, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {
+      
       this.termsStart = termsStart;
-      this.indexStart = indexStart;
-      this.packedIndexStart = packedIndexStart;
-      this.packedOffsetsStart = packedOffsetsStart;
+      termBytesStart = termBytes.getPointer();
+      
+      IndexInput clone = in.clone();
+      clone.seek(indexStart);
+      
       this.numIndexTerms = numIndexTerms;
-
-      if (indexDivisor > 0) {
-        loadTermsIndex();
-      }
-    }
-
-    private void loadTermsIndex() throws IOException {
-      if (coreIndex == null) {
-        coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
-      }
-    }
-
-    private final class CoreFieldIndex {
-
-      // where this field's terms begin in the packed byte[]
-      // data
-      final long termBytesStart;
-
-      // offset into index termBytes
-      final PackedInts.Reader termOffsets;
-
-      // index pointers into main terms dict
-      final PackedInts.Reader termsDictOffsets;
-
-      final int numIndexTerms;
-      final long termsStart;
-
-      public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
-
-        this.termsStart = termsStart;
-        termBytesStart = termBytes.getPointer();
-
-        IndexInput clone = in.clone();
-        clone.seek(indexStart);
-
-        // -1 is passed to mean "don't load term index", but
-        // if the index is later loaded it's overwritten with
-        // a real value
-        assert indexDivisor > 0;
-
-        this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor;
-
-        assert this.numIndexTerms  > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;
-
-        if (indexDivisor == 1) {
-          // Default (load all index terms) is fast -- slurp in the images from disk:
-          
-          try {
-            final long numTermBytes = packedIndexStart - indexStart;
-            termBytes.copy(clone, numTermBytes);
-
-            // records offsets into main terms dict file
-            termsDictOffsets = PackedInts.getReader(clone);
-            assert termsDictOffsets.size() == numIndexTerms;
-
-            // records offsets into byte[] term data
-            termOffsets = PackedInts.getReader(clone);
-            assert termOffsets.size() == 1+numIndexTerms;
-          } finally {
-            clone.close();
-          }
-        } else {
-          // Get packed iterators
-          final IndexInput clone1 = in.clone();
-          final IndexInput clone2 = in.clone();
-
-          try {
-            // Subsample the index terms
-            clone1.seek(packedIndexStart);
-            final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1, PackedInts.DEFAULT_BUFFER_SIZE);
-
-            clone2.seek(packedOffsetsStart);
-            final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2,  PackedInts.DEFAULT_BUFFER_SIZE);
-
-            // TODO: often we can get by w/ fewer bits per
-            // value, below.. .but this'd be more complex:
-            // we'd have to try @ fewer bits and then grow
-            // if we overflowed it.
-
-            PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue(), PackedInts.DEFAULT);
-            PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue(), PackedInts.DEFAULT);
-
-            termsDictOffsets = termsDictOffsetsM;
-            termOffsets = termOffsetsM;
-
-            int upto = 0;
-
-            long termOffsetUpto = 0;
-
-            while(upto < this.numIndexTerms) {
-              // main file offset copies straight over
-              termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
-
-              termOffsetsM.set(upto, termOffsetUpto);
-
-              long termOffset = termOffsetsIter.next();
-              long nextTermOffset = termOffsetsIter.next();
-              final int numTermBytes = (int) (nextTermOffset - termOffset);
-
-              clone.seek(indexStart + termOffset);
-              assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length();
-              assert indexStart + termOffset + numTermBytes < clone.length();
-
-              termBytes.copy(clone, numTermBytes);
-              termOffsetUpto += numTermBytes;
-
-              upto++;
-              if (upto == this.numIndexTerms) {
-                break;
-              }
-
-              // skip terms:
-              termsDictOffsetsIter.next();
-              for(int i=0;i<indexDivisor-2;i++) {
-                termOffsetsIter.next();
-                termsDictOffsetsIter.next();
-              }
-            }
-            termOffsetsM.set(upto, termOffsetUpto);
-
-          } finally {
-            clone1.close();
-            clone2.close();
-            clone.close();
-          }
-        }
+      assert this.numIndexTerms > 0 : "numIndexTerms=" + numIndexTerms;
+      
+      // slurp in the images from disk:
+      
+      try {
+        final long numTermBytes = packedIndexStart - indexStart;
+        termBytes.copy(clone, numTermBytes);
+        
+        // records offsets into main terms dict file
+        termsDictOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, numIndexTerms, false);
+        
+        // records offsets into byte[] term data
+        termOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
+      } finally {
+        clone.close();
       }
     }
   }
 
   @Override
   public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
-    final FieldIndexData fieldData = fields.get(fieldInfo);
-    if (fieldData.coreIndex == null) {
-      return null;
-    } else {
-      return new IndexEnum(fieldData.coreIndex);
-    }
+    return new IndexEnum(fields.get(fieldInfo));
   }
 
   @Override
-  public void close() throws IOException {
-    if (in != null && !indexLoaded) {
-      in.close();
-    }
-  }
+  public void close() throws IOException {}
 
   private void seekDir(IndexInput input, long dirOffset) throws IOException {
-    if (version >= FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
-      input.seek(input.length() - 8);
-      dirOffset = input.readLong();
-    }
+    input.seek(input.length() - 8);
+    dirOffset = input.readLong();
     input.seek(dirOffset);
   }
 }
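
FixedGapTermsIndexReader now reads each field's two offset arrays (term
offsets and terms-dict addresses) as MonotonicBlockPackedReader structures,
replacing the PackedInts readers and the indexDivisor subsampling path
entirely. Below is a self-contained round-trip of the monotonic block-packed
encoding this relies on, assuming an in-memory RAMDirectory purely for
illustration; the file name and values are made up, though the 4096 block
size mirrors BLOCKSIZE in the writer:

    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.IOContext;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
    import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
    import org.apache.lucene.util.packed.PackedInts;

    public class MonotonicRoundTrip {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexOutput out = dir.createOutput("offsets.bin", IOContext.DEFAULT);
        MonotonicBlockPackedWriter w = new MonotonicBlockPackedWriter(out, 4096);
        long[] offsets = { 0, 17, 40, 90 }; // non-decreasing, like term offsets
        for (long v : offsets) {
          w.add(v);
        }
        w.finish();
        out.close();

        // same constructor the reader above uses for termOffsets/termsDictOffsets
        IndexInput in = dir.openInput("offsets.bin", IOContext.DEFAULT);
        MonotonicBlockPackedReader r = new MonotonicBlockPackedReader(
            in, PackedInts.VERSION_CURRENT, 4096, offsets.length, false);
        System.out.println(r.get(2)); // prints 40
        in.close();
        dir.close();
      }
    }

The random-access get(long) is what backs the binary search in IndexEnum.seek
above; because the values are monotonic, each block stores a starting value,
an average slope, and bit-packed deltas, so the index stays compact without
the old divisor subsampling.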

Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java Sun Aug 11 12:19:13 2013
@@ -18,15 +18,16 @@ package org.apache.lucene.codecs.blockte
  */
 
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
 
 import java.util.List;
@@ -50,23 +51,32 @@ public class FixedGapTermsIndexWriter ex
   final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
   final static int VERSION_START = 0;
   final static int VERSION_APPEND_ONLY = 1;
-  final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
+  final static int VERSION_MONOTONIC_ADDRESSING = 2;
+  final static int VERSION_CURRENT = VERSION_MONOTONIC_ADDRESSING;
 
+  final static int BLOCKSIZE = 4096;
   final private int termIndexInterval;
+  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
 
   private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
   
-  @SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
-
   public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
+    this(state, DEFAULT_TERM_INDEX_INTERVAL);
+  }
+  
+  public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval) throws IOException {
+    if (termIndexInterval <= 0) {
+      throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
+    }
+    this.termIndexInterval = termIndexInterval;
     final String indexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
-    termIndexInterval = state.termIndexInterval;
     out = state.directory.createOutput(indexFileName, state.context);
     boolean success = false;
     try {
-      fieldInfos = state.fieldInfos;
       writeHeader(out);
-      out.writeInt(termIndexInterval);
+      out.writeVInt(termIndexInterval);
+      out.writeVInt(PackedInts.VERSION_CURRENT);
+      out.writeVInt(BLOCKSIZE);
       success = true;
     } finally {
       if (!success) {
@@ -114,22 +124,25 @@ public class FixedGapTermsIndexWriter ex
     long packedOffsetsStart;
     private long numTerms;
 
-    // TODO: we could conceivably make a PackedInts wrapper
-    // that auto-grows... then we wouldn't force 6 bytes RAM
-    // per index term:
-    private short[] termLengths;
-    private int[] termsPointerDeltas;
-    private long lastTermsPointer;
-    private long totTermLength;
+    private RAMOutputStream offsetsBuffer = new RAMOutputStream();
+    private MonotonicBlockPackedWriter termOffsets = new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);
+    private long currentOffset;
+
+    private RAMOutputStream addressBuffer = new RAMOutputStream();
+    private MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);
 
     private final BytesRef lastTerm = new BytesRef();
 
     SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
       this.fieldInfo = fieldInfo;
       indexStart = out.getFilePointer();
-      termsStart = lastTermsPointer = termsFilePointer;
-      termLengths = new short[0];
-      termsPointerDeltas = new int[0];
+      termsStart = termsFilePointer;
+      // we write numTerms+1 offsets; term n's length is offsets[n+1] - offsets[n]
+      try {
+        termOffsets.add(0L);
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
     }
 
     @Override
@@ -157,21 +170,13 @@ public class FixedGapTermsIndexWriter ex
       // against prior term
       out.writeBytes(text.bytes, text.offset, indexedTermLength);
 
-      if (termLengths.length == numIndexTerms) {
-        termLengths = ArrayUtil.grow(termLengths);
-      }
-      if (termsPointerDeltas.length == numIndexTerms) {
-        termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
-      }
-
       // save delta terms pointer
-      termsPointerDeltas[numIndexTerms] = (int) (termsFilePointer - lastTermsPointer);
-      lastTermsPointer = termsFilePointer;
+      termAddresses.add(termsFilePointer - termsStart);
 
       // save term length (in bytes)
       assert indexedTermLength <= Short.MAX_VALUE;
-      termLengths[numIndexTerms] = (short) indexedTermLength;
-      totTermLength += indexedTermLength;
+      currentOffset += indexedTermLength;
+      termOffsets.add(currentOffset);
 
       lastTerm.copyBytes(text);
       numIndexTerms++;
@@ -183,32 +188,20 @@ public class FixedGapTermsIndexWriter ex
       // write primary terms dict offsets
       packedIndexStart = out.getFilePointer();
 
-      PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(termsFilePointer), PackedInts.DEFAULT);
-
       // relative to our indexStart
-      long upto = 0;
-      for(int i=0;i<numIndexTerms;i++) {
-        upto += termsPointerDeltas[i];
-        w.add(upto);
-      }
-      w.finish();
+      termAddresses.finish();
+      addressBuffer.writeTo(out);
 
       packedOffsetsStart = out.getFilePointer();
 
       // write offsets into the byte[] terms
-      w = PackedInts.getWriter(out, 1+numIndexTerms, PackedInts.bitsRequired(totTermLength), PackedInts.DEFAULT);
-      upto = 0;
-      for(int i=0;i<numIndexTerms;i++) {
-        w.add(upto);
-        upto += termLengths[i];
-      }
-      w.add(upto);
-      w.finish();
+      termOffsets.finish();
+      offsetsBuffer.writeTo(out);
 
       // our referrer holds onto us, while other fields are
       // being written, so don't tie up this RAM:
-      termLengths = null;
-      termsPointerDeltas = null;
+      termOffsets = termAddresses = null;
+      addressBuffer = offsetsBuffer = null;
     }
   }
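
With the interval no longer pulled from SegmentWriteState, the writer now
offers a default constructor (DEFAULT_TERM_INDEX_INTERVAL = 32) and an
explicit one that rejects non-positive intervals. A hedged sketch of the two
construction paths; the SegmentWriteState is assumed to be handed in by the
surrounding PostingsFormat plumbing, and the helper class is illustrative:

    import java.io.IOException;

    import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter;
    import org.apache.lucene.index.SegmentWriteState;

    class IndexWriterConstruction {
      static FixedGapTermsIndexWriter defaults(SegmentWriteState state)
          throws IOException {
        return new FixedGapTermsIndexWriter(state); // interval = 32
      }

      static FixedGapTermsIndexWriter custom(SegmentWriteState state)
          throws IOException {
        // Larger intervals -> smaller index, more in-block scanning;
        // values <= 0 throw IllegalArgumentException.
        return new FixedGapTermsIndexWriter(state, 128);
      }
    }

finish() changes shape accordingly: the two MonotonicBlockPackedWriters are
finished and their RAMOutputStream buffers appended to the index file
verbatim, replacing the old grow-and-repack short[]/int[] path.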
 

Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java Sun Aug 11 12:19:13 2013
@@ -47,8 +47,6 @@ public abstract class TermsIndexReaderBa
 
   public abstract boolean supportsOrd();
 
-  public abstract int getDivisor();
-
   /** 
    * Similar to TermsEnum, except, the only "metadata" it
    * reports for a given indexed term is the long fileOffset