Posted to commits@lucene.apache.org by cm...@apache.org on 2013/08/11 14:19:39 UTC
svn commit: r1512909 [5/38] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/suggest/ dev-tools/idea/solr/contrib/dataimporthandler/
dev-tools/idea/solr/core/src/test/ dev-tool...
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestExtendedMode.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.Reader;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -54,7 +53,7 @@ public class TestExtendedMode extends Ba
int numIterations = atLeast(1000);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomUnicodeString(random(), 100);
- TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+ TokenStream ts = analyzer.tokenStream("foo", s);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
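For context: the test hunks in this commit replace tokenStream(field, new StringReader(s)) with the Analyzer.tokenStream(String, String) convenience overload, which wraps the string in a reusable reader internally. A minimal sketch of the consume pattern these tests follow; the analyzer choice, field name, and input text below are illustrative, not part of the commit:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamStringDemo {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_44);
        // String overload; no explicit StringReader needed anymore.
        TokenStream ts = analyzer.tokenStream("field", "some input text");
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // required before incrementToken()
        while (ts.incrementToken()) {
          System.out.println(termAtt.toString());
        }
        ts.end();                        // finalizes offset state
        ts.close();
      }
    }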
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.ja;
*/
import java.io.IOException;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -151,7 +150,7 @@ public class TestJapaneseAnalyzer extend
Mode.SEARCH,
JapaneseAnalyzer.getDefaultStopSet(),
JapaneseAnalyzer.getDefaultStopTags());
- assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
+ assertTokenStreamContents(a.tokenStream("foo", "abcd"),
new String[] { "a", "b", "cd" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 4 },
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java Sun Aug 11 12:19:13 2013
@@ -22,7 +22,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -142,7 +141,7 @@ public class TestJapaneseTokenizer exten
* ideally the test would actually fail instead of hanging...
*/
public void testDecomposition5() throws Exception {
- TokenStream ts = analyzer.tokenStream("bogus", new StringReader("ãããããããããããããããããããããããããããããããããããããããã"));
+ TokenStream ts = analyzer.tokenStream("bogus", "ãããããããããããããããããããããããããããããããããããããããã");
ts.reset();
while (ts.incrementToken()) {
@@ -166,8 +165,8 @@ public class TestJapaneseTokenizer exten
/** Tests that sentence offset is incorporated into the resulting offsets */
public void testTwoSentences() throws Exception {
/*
- //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
- TokenStream ts = analyzer.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+ //TokenStream ts = a.tokenStream("foo", "妹の咲子です。俺と年子で、今受験生です。");
+ TokenStream ts = analyzer.tokenStream("foo", "�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;");
ts.reset();
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while(ts.incrementToken()) {
@@ -214,7 +213,7 @@ public class TestJapaneseTokenizer exten
public void testLargeDocReliability() throws Exception {
for (int i = 0; i < 100; i++) {
String s = _TestUtil.randomUnicodeString(random(), 10000);
- TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+ TokenStream ts = analyzer.tokenStream("foo", s);
ts.reset();
while (ts.incrementToken()) {
}
@@ -235,7 +234,7 @@ public class TestJapaneseTokenizer exten
System.out.println("\nTEST: iter=" + i);
}
String s = _TestUtil.randomUnicodeString(random(), 100);
- TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+ TokenStream ts = analyzer.tokenStream("foo", s);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
@@ -245,14 +244,14 @@ public class TestJapaneseTokenizer exten
}
public void testOnlyPunctuation() throws IOException {
- TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+ TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。");
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
}
public void testOnlyPunctuationExtended() throws IOException {
- TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+ TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......");
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
@@ -261,14 +260,14 @@ public class TestJapaneseTokenizer exten
// note: test is kinda silly since kuromoji emits punctuation tokens.
// but, when/if we filter these out it will be useful.
public void testEnd() throws Exception {
- assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
+ assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", "これは本ではない"),
new String[] { "これ", "は", "本", "で", "は", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 },
new Integer(8)
);
- assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない ")),
+ assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", "これは本ではない "),
new String[] { "これ", "は", "本", "で", "は", "ない" },
new int[] { 0, 2, 3, 4, 5, 6, 8 },
new int[] { 2, 3, 4, 5, 6, 8, 9 },
@@ -279,7 +278,7 @@ public class TestJapaneseTokenizer exten
public void testUserDict() throws Exception {
// Not a great test because w/o userdict.txt the
// segmentation is the same:
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "関西国際空港に行った"),
new String[] { "関西", "国際", "空港", "に", "行っ", "た" },
new int[] { 0, 2, 4, 6, 7, 9 },
new int[] { 2, 4, 6, 7, 9, 10 },
@@ -289,7 +288,7 @@ public class TestJapaneseTokenizer exten
public void testUserDict2() throws Exception {
// Better test: w/o userdict the segmentation is different:
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "朝青龍"),
new String[] { "朝青龍" },
new int[] { 0 },
new int[] { 3 },
@@ -299,7 +298,7 @@ public class TestJapaneseTokenizer exten
public void testUserDict3() throws Exception {
// Test entry that breaks into multiple tokens:
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "abcd"),
new String[] { "a", "b", "cd" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 4 },
@@ -315,7 +314,7 @@ public class TestJapaneseTokenizer exten
/*
public void testUserDict4() throws Exception {
// Test entry that has another entry as prefix
- assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+ assertTokenStreamContents(analyzer.tokenStream("foo", "abcdefghij"),
new String[] { "ab", "cd", "efg", "hij" },
new int[] { 0, 2, 4, 7 },
new int[] { 2, 4, 7, 10 },
@@ -366,7 +365,7 @@ public class TestJapaneseTokenizer exten
}
private void assertReadings(String input, String... readings) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
ts.reset();
for(String reading : readings) {
@@ -378,7 +377,7 @@ public class TestJapaneseTokenizer exten
}
private void assertPronunciations(String input, String... pronunciations) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
ts.reset();
for(String pronunciation : pronunciations) {
@@ -390,7 +389,7 @@ public class TestJapaneseTokenizer exten
}
private void assertBaseForms(String input, String... baseForms) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
ts.reset();
for(String baseForm : baseForms) {
@@ -402,7 +401,7 @@ public class TestJapaneseTokenizer exten
}
private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
ts.reset();
for(String inflectionType : inflectionTypes) {
@@ -414,7 +413,7 @@ public class TestJapaneseTokenizer exten
}
private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
ts.reset();
for(String inflectionForm : inflectionForms) {
@@ -426,7 +425,7 @@ public class TestJapaneseTokenizer exten
}
private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
- TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+ TokenStream ts = analyzer.tokenStream("ignored", input);
PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
ts.reset();
for(String partOfSpeech : partsOfSpeech) {
@@ -619,7 +618,7 @@ public class TestJapaneseTokenizer exten
if (numIterations > 1) {
// warmup
for (int i = 0; i < numIterations; i++) {
- final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+ final TokenStream ts = analyzer.tokenStream("ignored", line);
ts.reset();
while(ts.incrementToken());
}
@@ -628,7 +627,7 @@ public class TestJapaneseTokenizer exten
long totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
- final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+ final TokenStream ts = analyzer.tokenStream("ignored", line);
ts.reset();
while(ts.incrementToken());
}
@@ -640,7 +639,7 @@ public class TestJapaneseTokenizer exten
totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
for (String sentence: sentences) {
- final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+ final TokenStream ts = analyzer.tokenStream("ignored", sentence);
ts.reset();
while(ts.incrementToken());
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java Sun Aug 11 12:19:13 2013
@@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder
System.out.println(" encode...");
- PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
+ PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
IntsRef scratch = new IntsRef();
long ord = -1; // first ord will be 0
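For context: PositiveIntOutputs.getSingleton() no longer takes the boolean sharing flag, as the hunk above shows. A small standalone sketch of building an FST with the no-argument singleton; the input terms and output values here are made up for illustration:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRef;
    import org.apache.lucene.util.fst.Builder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    public class FstSingletonDemo {
      public static void main(String[] args) throws Exception {
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRef scratch = new IntsRef();
        // Input terms must be added in sorted order.
        builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
        builder.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
        FST<Long> fst = builder.finish();
        System.out.println(Util.get(fst, new BytesRef("dog"))); // prints 7
      }
    }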
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/ivy.xml Sun Aug 11 12:19:13 2013
@@ -19,9 +19,9 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
<dependencies>
- <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.5" transitive="false"/>
- <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.5" transitive="false"/>
- <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.5" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -26,38 +26,21 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
-
/**
* {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikAnalyzer extends Analyzer {
-
- private final DICTIONARY dictionary;
private final Version version;
/**
- * Builds an analyzer for a given PolishStemmer.DICTIONARY enum.
- *
- * @param vers
- * lucene compatibility version
- * @param dict
- * A constant specifying which dictionary to choose. See the
- * Morfologik documentation for details or use the default.
- */
- public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
- this.version = vers;
- this.dictionary = dict;
- }
-
- /**
- * Builds an analyzer for an original MORFOLOGIK dictionary.
+ * Builds an analyzer with the default Morfologik dictionary (polimorf).
*
- * @param vers lucene compatibility version
+ * @param version
+ * Lucene compatibility version
*/
- public MorfologikAnalyzer(final Version vers) {
- this(vers, DICTIONARY.MORFOLOGIK);
+ public MorfologikAnalyzer(final Version version) {
+ this.version = version;
}
/**
@@ -78,7 +61,7 @@ public class MorfologikAnalyzer extends
final Tokenizer src = new StandardTokenizer(this.version, reader);
return new TokenStreamComponents(
- src,
- new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
+ src,
+ new MorfologikFilter(new StandardFilter(this.version, src), this.version));
}
}
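For context: with the dictionary enum removed, MorfologikAnalyzer is constructed from a Lucene compatibility version alone. A usage sketch under that assumption; the field name and input text are illustrative:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class MorfologikAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new MorfologikAnalyzer(Version.LUCENE_44);  // single bundled dictionary
        TokenStream ts = a.tokenStream("body", "liście danych");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term);  // one token per lemma interpretation
        }
        ts.end();
        ts.close();
      }
    }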
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Sun Aug 11 12:19:13 2013
@@ -20,22 +20,24 @@ package org.apache.lucene.analysis.morfo
import java.io.IOException;
import java.util.*;
+import java.util.regex.Pattern;
import morfologik.stemming.*;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;
/**
- * {@link TokenFilter} using Morfologik library.
+ * {@link TokenFilter} using Morfologik library to transform input tokens into lemma and
+ * morphosyntactic (POS) tokens. Applies to Polish only.
*
- * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
- * annotations for produced lemmas. See the Morfologik documentation for details.
+ * <p>MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
+ * annotations for produced lemmas. See the Morfologik documentation for details.</p>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
@@ -44,6 +46,7 @@ public class MorfologikFilter extends To
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final CharsRef scratch = new CharsRef(0);
private final CharacterUtils charUtils;
@@ -58,13 +61,11 @@ public class MorfologikFilter extends To
private int lemmaListIndex;
/**
- * Builds a filter for given PolishStemmer.DICTIONARY enum.
- *
+ * Creates a MorfologikFilter.
* @param in input token stream
- * @param dict PolishStemmer.DICTIONARY enum
* @param version Lucene version compatibility for lowercasing.
*/
- public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
+ public MorfologikFilter(final TokenStream in, final Version version) {
super(in);
this.input = in;
@@ -73,7 +74,7 @@ public class MorfologikFilter extends To
ClassLoader cl = me.getContextClassLoader();
try {
me.setContextClassLoader(PolishStemmer.class.getClassLoader());
- this.stemmer = new PolishStemmer(dict);
+ this.stemmer = new PolishStemmer();
this.charUtils = CharacterUtils.getInstance(version);
this.lemmaList = Collections.emptyList();
} finally {
@@ -81,44 +82,30 @@ public class MorfologikFilter extends To
}
}
+ /**
+ * A pattern used to split lemma forms.
+ */
+ private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
+
private void popNextLemma() {
- // Collect all tags for the next unique lemma.
- CharSequence currentStem;
- int tags = 0;
- do {
- final WordData lemma = lemmaList.get(lemmaListIndex++);
- currentStem = lemma.getStem();
- final CharSequence tag = lemma.getTag();
- if (tag != null) {
- if (tagsList.size() <= tags) {
+ // One tag (concatenated) per lemma.
+ final WordData lemma = lemmaList.get(lemmaListIndex++);
+ termAtt.setEmpty().append(lemma.getStem());
+ CharSequence tag = lemma.getTag();
+ if (tag != null) {
+ String[] tags = lemmaSplitter.split(tag.toString());
+ for (int i = 0; i < tags.length; i++) {
+ if (tagsList.size() <= i) {
tagsList.add(new StringBuilder());
}
-
- final StringBuilder buffer = tagsList.get(tags++);
+ StringBuilder buffer = tagsList.get(i);
buffer.setLength(0);
- buffer.append(lemma.getTag());
- }
- } while (lemmaListIndex < lemmaList.size() &&
- equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
-
- // Set the lemma's base form and tags as attributes.
- termAtt.setEmpty().append(currentStem);
- tagsAtt.setTags(tagsList.subList(0, tags));
- }
-
- /**
- * Compare two char sequences for equality. Assumes non-null arguments.
- */
- private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
- int len1 = s1.length();
- int len2 = s2.length();
- if (len1 != len2) return false;
- for (int i = len1; --i >= 0;) {
- if (s1.charAt(i) != s2.charAt(i)) {
- return false;
+ buffer.append(tags[i]);
}
+ tagsAtt.setTags(tagsList.subList(0, tags.length));
+ } else {
+ tagsAtt.setTags(Collections.<StringBuilder> emptyList());
}
- return true;
}
/**
@@ -140,7 +127,8 @@ public class MorfologikFilter extends To
popNextLemma();
return true;
} else if (this.input.incrementToken()) {
- if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+ if (!keywordAttr.isKeyword() &&
+ (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
current = captureState();
popNextLemma();
} else {
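For context: the rewritten popNextLemma() assumes Morfologik 1.7 returns one WordData per lemma, with alternative tag sets concatenated by '+' or '|', which the filter splits back apart. A standalone illustration of just that splitting step; the tag string is hypothetical:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Pattern;

    public class LemmaTagSplitDemo {
      // Same pattern as the lemmaSplitter field added above.
      private static final Pattern LEMMA_SPLITTER = Pattern.compile("\\+|\\|");

      public static void main(String[] args) {
        String concatenated = "subst:pl:nom:m3+subst:pl:acc:m3";
        List<StringBuilder> tagsList = new ArrayList<StringBuilder>();
        for (String tag : LEMMA_SPLITTER.split(concatenated)) {
          tagsList.add(new StringBuilder(tag));
        }
        System.out.println(tagsList);  // [subst:pl:nom:m3, subst:pl:acc:m3]
      }
    }

The KeywordAttribute check added to incrementToken() means tokens marked by an upstream SetKeywordMarkerFilter skip dictionary lookup entirely, as the new testKeywordAttrTokens test further down exercises.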
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -17,12 +17,8 @@ package org.apache.lucene.analysis.morfo
* limitations under the License.
*/
-import java.util.Arrays;
-import java.util.Locale;
import java.util.Map;
-import morfologik.stemming.PolishStemmer.DICTIONARY;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -32,39 +28,28 @@ import org.apache.lucene.analysis.util.T
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+ * <filter class="solr.MorfologikFilterFactory" />
* </analyzer>
* </fieldType></pre>
*
- * <p>Any of Morfologik dictionaries can be used, these are at the moment:
- * <code>MORFOLOGIK</code> (Morfologik's original dictionary),
- * <code>MORFEUSZ</code> (Morfeusz-SIAT),
- * <code>COMBINED</code> (both of the dictionaries above, combined).
- *
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory {
- /** Dictionary. */
- private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK;
-
/** Schema attribute. */
+ @Deprecated
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
-
+
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
+
+ // Be specific about no-longer-supported dictionary attribute.
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) {
- try {
- DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT));
- assert dictionary != null;
- this.dictionary = dictionary;
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the "
- + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: "
- + dictionaryName);
- }
+ throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
+ + "longer supported (Morfologik has one dictionary): " + dictionaryName);
}
+
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -72,6 +57,6 @@ public class MorfologikFilterFactory ext
@Override
public TokenStream create(TokenStream ts) {
- return new MorfologikFilter(ts, dictionary, luceneMatchVersion);
+ return new MorfologikFilter(ts, luceneMatchVersion);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java Sun Aug 11 12:19:13 2013
@@ -23,9 +23,9 @@ import java.util.List;
import org.apache.lucene.util.Attribute;
/**
- * Morfologik dictionaries provide morphosyntactic annotations for
+ * Morfologik provides morphosyntactic annotations for
* surface forms. For the exact format and description of these,
- * see the project's documentation (annotations vary by dictionary!).
+ * see the project's documentation.
*/
public interface MorphosyntacticTagsAttribute extends Attribute {
/**
@@ -36,7 +36,9 @@ public interface MorphosyntacticTagsAttr
public void setTags(List<StringBuilder> tags);
/**
- * Returns the POS tag of the term.
+ * Returns the POS tag of the term. A single word may have multiple POS tags,
+ * depending on the interpretation (context disambiguation is typically needed
+ * to determine which particular tag is appropriate).
*/
public List<StringBuilder> getTags();
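For context: a sketch of reading these tags off a token stream, mirroring the dumpTokens() helper added to the test below; the field name and input are illustrative:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
    import org.apache.lucene.analysis.morfologik.MorphosyntacticTagsAttribute;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class TagsAttributeDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new MorfologikAnalyzer(Version.LUCENE_44);
        TokenStream ts = a.tokenStream("dummy", "danych");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        MorphosyntacticTagsAttribute tags = ts.addAttribute(MorphosyntacticTagsAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // Several tags per lemma are possible; disambiguation is the caller's job.
          System.out.println(term + " => " + tags.getTags());
        }
        ts.end();
        ts.close();
      }
    }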
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -18,11 +18,19 @@ package org.apache.lucene.analysis.morfo
*/
import java.io.IOException;
-import java.io.StringReader;
+import java.io.Reader;
import java.util.TreeSet;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -56,16 +64,28 @@ public class TestMorfologikAnalyzer exte
assertAnalyzesToReuse(
a,
"T. Gl\u00FCcksberg",
- new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
- new int[] { 0, 0, 0, 3 },
- new int[] { 1, 1, 1, 13 },
- new int[] { 1, 0, 0, 1 });
+ new String[] { "tom", "tona", "Gl\u00FCcksberg" },
+ new int[] { 0, 0, 3 },
+ new int[] { 1, 1, 13 },
+ new int[] { 1, 0, 1 });
+ }
+
+ @SuppressWarnings("unused")
+ private void dumpTokens(String input) throws IOException {
+ TokenStream ts = getTestAnalyzer().tokenStream("dummy", input);
+ ts.reset();
+
+ MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
+ CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
+ while (ts.incrementToken()) {
+ System.out.println(charTerm.toString() + " => " + attribute.getTags());
+ }
}
/** Test reuse of MorfologikFilter with leftover stems. */
public final void testLeftoverStems() throws IOException {
Analyzer a = getTestAnalyzer();
- TokenStream ts_1 = a.tokenStream("dummy", new StringReader("liście"));
+ TokenStream ts_1 = a.tokenStream("dummy", "liście");
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
ts_1.reset();
ts_1.incrementToken();
@@ -73,7 +93,7 @@ public class TestMorfologikAnalyzer exte
ts_1.end();
ts_1.close();
- TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
+ TokenStream ts_2 = a.tokenStream("dummy", "danych");
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
ts_2.reset();
ts_2.incrementToken();
@@ -120,7 +140,7 @@ public class TestMorfologikAnalyzer exte
/** Test morphosyntactic annotations. */
public final void testPOSAttribute() throws IOException {
- TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
+ TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście");
ts.reset();
assertPOSToken(ts, "liście",
@@ -144,6 +164,34 @@ public class TestMorfologikAnalyzer exte
ts.close();
}
+ /** */
+ public final void testKeywordAttrTokens() throws IOException {
+ final Version version = TEST_VERSION_CURRENT;
+
+ Analyzer a = new MorfologikAnalyzer(version) {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final CharArraySet keywords = new CharArraySet(version, 1, false);
+ keywords.add("liście");
+
+ final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+ result = new SetKeywordMarkerFilter(result, keywords);
+ result = new MorfologikFilter(result, TEST_VERSION_CURRENT);
+
+ return new TokenStreamComponents(src, result);
+ }
+ };
+
+ assertAnalyzesToReuse(
+ a,
+ "liÅcie danych",
+ new String[] { "liÅcie", "dany", "dana", "dane", "daÄ" },
+ new int[] { 0, 7, 7, 7, 7 },
+ new int[] { 6, 13, 13, 13, 13 },
+ new int[] { 1, 1, 0, 0, 0 });
+ }
+
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Sun Aug 11 12:19:13 2013
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.morfo
*/
import java.io.StringReader;
+import java.util.Collections;
import java.util.HashMap;
-import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
@@ -31,10 +31,7 @@ import org.apache.lucene.analysis.TokenS
public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
public void testCreateDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
- Map<String,String> initParams = new HashMap<String,String>();
- initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE,
- "morfologik");
- MorfologikFilterFactory factory = new MorfologikFilterFactory(initParams);
+ MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Sun Aug 11 12:19:13 2013
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.cn.sm
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.util.Properties;
+import org.apache.lucene.util.IOUtils;
+
/**
* Manages analysis data configuration for SmartChineseAnalyzer
* <p>
@@ -77,13 +80,13 @@ public class AnalyzerProfile {
Properties prop = new Properties();
try {
FileInputStream input = new FileInputStream(propFile);
- prop.load(input);
+ prop.load(new InputStreamReader(input, IOUtils.CHARSET_UTF_8));
String dir = prop.getProperty("analysis.data.dir", "");
input.close();
return dir;
} catch (IOException e) {
+ return "";
}
- return "";
}
}
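For context: Properties.load(InputStream) always decodes as ISO-8859-1, which mangles UTF-8 paths; the hunk above wraps the stream in a UTF-8 InputStreamReader instead, and moves the "" fallback into the catch block. A standalone sketch of the same idiom; the file name is illustrative:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.nio.charset.Charset;
    import java.util.Properties;

    public class Utf8PropertiesDemo {
      public static void main(String[] args) throws IOException {
        Properties prop = new Properties();
        FileInputStream input = new FileInputStream("analysis.properties");
        try {
          // Reader-based load honors the charset we choose instead of ISO-8859-1.
          prop.load(new InputStreamReader(input, Charset.forName("UTF-8")));
        } finally {
          input.close();
        }
        System.out.println(prop.getProperty("analysis.data.dir", ""));
      }
    }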
Modified: lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Sun Aug 11 12:19:13 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cn.sm
import java.io.IOException;
import java.io.Reader;
-import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -185,7 +184,7 @@ public class TestSmartChineseAnalyzer ex
sb.append("æè´ä¹°äºéå
·åæè£
ã");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
- TokenStream stream = analyzer.tokenStream("", new StringReader(sb.toString()));
+ TokenStream stream = analyzer.tokenStream("", sb.toString());
stream.reset();
while (stream.incrementToken()) {
}
@@ -198,7 +197,7 @@ public class TestSmartChineseAnalyzer ex
sb.append("æè´ä¹°äºéå
·åæè£
");
}
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
- TokenStream stream = analyzer.tokenStream("", new StringReader(sb.toString()));
+ TokenStream stream = analyzer.tokenStream("", sb.toString());
stream.reset();
while (stream.incrementToken()) {
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java Sun Aug 11 12:19:13 2013
@@ -35,7 +35,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
@@ -62,7 +61,7 @@ public class UIMABaseAnalyzerTest extend
@Test
public void baseUIMAAnalyzerStreamTest() throws Exception {
- TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
+ TokenStream ts = analyzer.tokenStream("text", "the big brown fox jumped on the wood");
assertTokenStreamContents(ts, new String[]{"the", "big", "brown", "fox", "jumped", "on", "the", "wood"});
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java Sun Aug 11 12:19:13 2013
@@ -23,8 +23,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.StringReader;
-
/**
* Testcase for {@link UIMATypeAwareAnalyzer}
*/
@@ -51,7 +49,7 @@ public class UIMATypeAwareAnalyzerTest e
public void baseUIMATypeAwareAnalyzerStreamTest() throws Exception {
// create a token stream
- TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
+ TokenStream ts = analyzer.tokenStream("text", "the big brown fox jumped on the wood");
// check that 'the big brown fox jumped on the wood' tokens have the expected PoS types
assertTokenStreamContents(ts,
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Sun Aug 11 12:19:13 2013
@@ -119,14 +119,9 @@ public class CreateIndexTask extends Per
if (mergeScheduler.equals("org.apache.lucene.index.ConcurrentMergeScheduler")) {
ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwConf.getMergeScheduler();
- int v = config.get("concurrent.merge.scheduler.max.thread.count", -1);
- if (v != -1) {
- cms.setMaxThreadCount(v);
- }
- v = config.get("concurrent.merge.scheduler.max.merge.count", -1);
- if (v != -1) {
- cms.setMaxMergeCount(v);
- }
+ int maxThreadCount = config.get("concurrent.merge.scheduler.max.thread.count", ConcurrentMergeScheduler.DEFAULT_MAX_THREAD_COUNT);
+ int maxMergeCount = config.get("concurrent.merge.scheduler.max.merge.count", ConcurrentMergeScheduler.DEFAULT_MAX_MERGE_COUNT);
+ cms.setMaxMergesAndThreads(maxMergeCount, maxThreadCount);
}
}
@@ -151,13 +146,10 @@ public class CreateIndexTask extends Per
} catch (Exception e) {
throw new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy", e);
}
+ iwConf.getMergePolicy().setNoCFSRatio(isCompound ? 1.0 : 0.0);
if(iwConf.getMergePolicy() instanceof LogMergePolicy) {
LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
- logMergePolicy.setUseCompoundFile(isCompound);
logMergePolicy.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR));
- } else if(iwConf.getMergePolicy() instanceof TieredMergePolicy) {
- TieredMergePolicy tieredMergePolicy = (TieredMergePolicy) iwConf.getMergePolicy();
- tieredMergePolicy.setUseCompoundFile(isCompound);
}
}
final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
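For context: compound-file selection moved from the per-policy setUseCompoundFile() flags to setNoCFSRatio() on the base MergePolicy, and the two ConcurrentMergeScheduler limits are now set together in one call. A configuration sketch under those assumptions; the analyzer and values are illustrative:

    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.index.ConcurrentMergeScheduler;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.util.Version;

    public class MergeConfigDemo {
      public static void main(String[] args) {
        IndexWriterConfig iwConf = new IndexWriterConfig(
            Version.LUCENE_44, new WhitespaceAnalyzer(Version.LUCENE_44));
        // 1.0 = always write compound files, 0.0 = never (replaces setUseCompoundFile).
        iwConf.getMergePolicy().setNoCFSRatio(1.0);
        ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
        cms.setMaxMergesAndThreads(4, 2);  // maxMergeCount, maxThreadCount
        iwConf.setMergeScheduler(cms);
      }
    }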
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java Sun Aug 11 12:19:13 2013
@@ -18,9 +18,9 @@ package org.apache.lucene.benchmark.byTa
*/
import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -80,8 +80,7 @@ public class Config {
}
// read props from string
this.props = new Properties();
- // props.load always assumes iso8859-1...
- props.load(new ByteArrayInputStream(sb.toString().getBytes("ISO-8859-1")));
+ props.load(new StringReader(sb.toString()));
// make sure work dir is set properly
if (props.get("work.dir") == null) {
Modified: lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/lucene4956/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Sun Aug 11 12:19:13 2013
@@ -21,7 +21,6 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
-import java.io.StringReader;
import java.text.Collator;
import java.util.List;
import java.util.Locale;
@@ -49,6 +48,7 @@ import org.apache.lucene.index.IndexWrit
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
@@ -754,7 +754,7 @@ public class TestPerfTasksLogic extends
assertEquals(2, writer.getConfig().getMaxBufferedDocs());
assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
- assertFalse(((LogMergePolicy) writer.getConfig().getMergePolicy()).getUseCompoundFile());
+ assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
writer.close();
Directory dir = benchmark.getRunData().getDirectory();
IndexReader reader = DirectoryReader.open(dir);
@@ -978,8 +978,8 @@ public class TestPerfTasksLogic extends
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
throws Exception {
- TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
- TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
+ TokenStream ts1 = a1.tokenStream("bogus", text);
+ TokenStream ts2 = a2.tokenStream("bogus", text);
ts1.reset();
ts2.reset();
TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
@@ -1029,7 +1029,7 @@ public class TestPerfTasksLogic extends
Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.getRunData().getAnalyzer().tokenStream
- ("bogus", new StringReader(text)).close();
+ ("bogus", text).close();
BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
new String[] { "one", "one two", "two", "two three",
"three", "three four", "four", "four five",
Modified: lucene/dev/branches/lucene4956/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/build.xml?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/build.xml Sun Aug 11 12:19:13 2013
@@ -183,7 +183,9 @@
<forbidden-apis internalRuntimeForbidden="true" classpathref="forbidden-apis.classpath">
<bundledSignatures name="jdk-unsafe-${javac.target}"/>
<bundledSignatures name="jdk-deprecated-${javac.target}"/>
- <signaturesFileSet file="${common.dir}/tools/forbiddenApis/executors.txt"/>
+ <signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
+ <include name="base.txt" />
+ </signaturesFileSet>
<fileset dir="${basedir}/build" includes="**/*.class" />
</forbidden-apis>
</target>
@@ -345,7 +347,7 @@
</target>
<!-- rat-sources-typedef is *not* a useless dependency. do not remove -->
- <target name="rat-sources" depends="rat-sources-typedef">
+ <target name="rat-sources" depends="rat-sources-typedef,common.rat-sources">
<subant target="rat-sources" failonerror="true" inheritall="false">
<propertyset refid="uptodate.and.compiled.properties"/>
<fileset dir="core" includes="build.xml"/>
@@ -608,4 +610,13 @@
<jar-checksum-macro srcdir="${common.dir}" dstdir="${common.dir}/licenses"/>
</target>
+ <target name="regenerate">
+ <subant target="regenerate" failonerror="true" inheritall="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ <fileset dir="core" includes="build.xml"/>
+ <fileset dir="test-framework" includes="build.xml"/>
+ </subant>
+ <modules-crawl target="regenerate"/>
+ </target>
+
</project>
Modified: lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/lucene4956/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Sun Aug 11 12:19:13 2013
@@ -33,7 +33,6 @@ import org.apache.lucene.search.Wildcard
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
-import java.io.StringReader;
import java.util.Collection;
import java.util.LinkedList;
@@ -86,7 +85,7 @@ public class SimpleNaiveBayesClassifier
private String[] tokenizeDoc(String doc) throws IOException {
Collection<String> result = new LinkedList<String>();
- TokenStream tokenStream = analyzer.tokenStream(textFieldName, new StringReader(doc));
+ TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java Sun Aug 11 12:19:13 2013
@@ -69,9 +69,6 @@ public class BlockTermsReader extends Fi
private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
- // Caches the most recently looked-up field + terms:
- private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
-
// Reads the terms index
private TermsIndexReaderBase indexReader;
@@ -113,11 +110,10 @@ public class BlockTermsReader extends Fi
// private String segment;
public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, SegmentInfo info, PostingsReaderBase postingsReader, IOContext context,
- int termsCacheSize, String segmentSuffix)
+ String segmentSuffix)
throws IOException {
this.postingsReader = postingsReader;
- termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
// this.segment = segment;
in = dir.openInput(IndexFileNames.segmentFileName(info.name, segmentSuffix, BlockTermsWriter.TERMS_EXTENSION),
@@ -317,11 +313,6 @@ public class BlockTermsReader extends Fi
calls next() (which is not "typical"), then we'll do the real seek */
private boolean seekPending;
- /* How many blocks we've read since last seek. Once this
- is >= indexEnum.getDivisor() we set indexIsCurrent to false (since
- the index can no long bracket seek-within-block). */
- private int blocksSinceSeek;
-
private byte[] termSuffixes;
private ByteArrayDataInput termSuffixesReader = new ByteArrayDataInput();
@@ -362,13 +353,13 @@ public class BlockTermsReader extends Fi
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
@Override
- public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
+ public SeekStatus seekCeil(final BytesRef target) throws IOException {
if (indexEnum == null) {
throw new IllegalStateException("terms index was not loaded");
}
- //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
+ //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
if (didIndexNext) {
if (nextIndexTerm == null) {
//System.out.println(" nextIndexTerm=null");
@@ -377,23 +368,6 @@ public class BlockTermsReader extends Fi
}
}
- // Check cache
- if (useCache) {
- fieldTerm.term = target;
- // TODO: should we differentiate "frozen"
- // TermState (ie one that was cloned and
- // cached/returned by termState()) from the
- // malleable (primary) one?
- final TermState cachedState = termsCache.get(fieldTerm);
- if (cachedState != null) {
- seekPending = true;
- //System.out.println(" cached!");
- seekExact(target, cachedState);
- //System.out.println(" term=" + term.utf8ToString());
- return SeekStatus.FOUND;
- }
- }
-
boolean doSeek = true;
// See if we can avoid seeking, because target term
@@ -441,8 +415,7 @@ public class BlockTermsReader extends Fi
assert result;
indexIsCurrent = true;
- didIndexNext = false;
- blocksSinceSeek = 0;
+ didIndexNext = false;
if (doOrd) {
state.ord = indexEnum.ord()-1;
@@ -574,14 +547,6 @@ public class BlockTermsReader extends Fi
// Done! Exact match. Stop here, fill in
// real term, return FOUND.
//System.out.println(" FOUND");
-
- if (useCache) {
- // Store in cache
- decodeMetaData();
- //System.out.println(" cache! state=" + state);
- termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
- }
-
return SeekStatus.FOUND;
} else {
//System.out.println(" NOT_FOUND");
@@ -758,7 +723,6 @@ public class BlockTermsReader extends Fi
indexIsCurrent = true;
didIndexNext = false;
- blocksSinceSeek = 0;
seekPending = false;
state.ord = indexEnum.ord()-1;
@@ -831,8 +795,7 @@ public class BlockTermsReader extends Fi
postingsReader.readTermsBlock(in, fieldInfo, state);
- blocksSinceSeek++;
- indexIsCurrent = indexIsCurrent && (blocksSinceSeek < indexReader.getDivisor());
+ indexIsCurrent = false;
//System.out.println(" indexIsCurrent=" + indexIsCurrent);
return true;
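For context: with the terms cache removed, seekCeil loses its useCache flag and callers pass only the target term. A caller-side sketch; the enum variable and term text are illustrative:

    import java.io.IOException;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    public class SeekCeilDemo {
      static boolean termExists(TermsEnum termsEnum, String text) throws IOException {
        // Formerly seekCeil(target, useCache); the cache parameter is gone.
        return termsEnum.seekCeil(new BytesRef(text)) == TermsEnum.SeekStatus.FOUND;
      }
    }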
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexReader.java Sun Aug 11 12:19:13 2013
@@ -27,7 +27,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
-import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
import java.util.HashMap;
import java.util.Comparator;
@@ -43,21 +43,15 @@ import org.apache.lucene.index.IndexFile
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
- // NOTE: long is overkill here, since this number is 128
- // by default and only indexDivisor * 128 if you change
- // the indexDivisor at search time. But, we use this in a
+ // NOTE: long is overkill here, but we use this in a
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So to avoid
// having to upgrade each multiple to long in multiple
// places (error prone), we use long here:
- private long totalIndexInterval;
-
- private int indexDivisor;
- final private int indexInterval;
-
- // Closed if indexLoaded is true:
- private IndexInput in;
- private volatile boolean indexLoaded;
+ private final long indexInterval;
+
+ private final int packedIntsVersion;
+ private final int blocksize;
private final Comparator<BytesRef> termComp;
@@ -72,35 +66,24 @@ public class FixedGapTermsIndexReader ex
// start of the field info data
private long dirOffset;
- private final int version;
-
- public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
+ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
throws IOException {
this.termComp = termComp;
-
- assert indexDivisor == -1 || indexDivisor > 0;
-
- in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);
+
+ final IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);
boolean success = false;
try {
- version = readHeader(in);
- indexInterval = in.readInt();
+ readHeader(in);
+ indexInterval = in.readVInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
}
- this.indexDivisor = indexDivisor;
-
- if (indexDivisor < 0) {
- totalIndexInterval = indexInterval;
- } else {
- // In case terms index gets loaded, later, on demand
- totalIndexInterval = indexInterval * indexDivisor;
- }
- assert totalIndexInterval > 0;
+ packedIntsVersion = in.readVInt();
+ blocksize = in.readVInt();
seekDir(in, dirOffset);
@@ -112,7 +95,7 @@ public class FixedGapTermsIndexReader ex
//System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
- final int numIndexTerms = in.readVInt();
+ final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
if (numIndexTerms < 0) {
throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
}
@@ -124,47 +107,33 @@ public class FixedGapTermsIndexReader ex
throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + " numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
}
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
+ FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(in, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
}
}
success = true;
} finally {
- if (!success) {
+ if (success) {
+ IOUtils.close(in);
+ } else {
IOUtils.closeWhileHandlingException(in);
}
- if (indexDivisor > 0) {
- in.close();
- in = null;
- if (success) {
- indexLoaded = true;
- }
- termBytesReader = termBytes.freeze(true);
- }
+ termBytesReader = termBytes.freeze(true);
}
}
-
- @Override
- public int getDivisor() {
- return indexDivisor;
- }
- private int readHeader(IndexInput input) throws IOException {
- int version = CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
- FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_CURRENT);
- if (version < FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
- dirOffset = input.readLong();
- }
- return version;
+ private void readHeader(IndexInput input) throws IOException {
+ CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
+ FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT);
}
private class IndexEnum extends FieldIndexEnum {
- private final FieldIndexData.CoreFieldIndex fieldIndex;
+ private final FieldIndexData fieldIndex;
private final BytesRef term = new BytesRef();
private long ord;
- public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
+ public IndexEnum(FieldIndexData fieldIndex) {
this.fieldIndex = fieldIndex;
}
@@ -175,12 +144,11 @@ public class FixedGapTermsIndexReader ex
@Override
public long seek(BytesRef target) {
- int lo = 0; // binary search
- int hi = fieldIndex.numIndexTerms - 1;
- assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
+ long lo = 0; // binary search
+ long hi = fieldIndex.numIndexTerms - 1;
while (hi >= lo) {
- int mid = (lo + hi) >>> 1;
+ long mid = (lo + hi) >>> 1;
final long offset = fieldIndex.termOffsets.get(mid);
final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
@@ -193,7 +161,7 @@ public class FixedGapTermsIndexReader ex
lo = mid + 1;
} else {
assert mid >= 0;
- ord = mid*totalIndexInterval;
+ ord = mid*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
}
}
@@ -207,17 +175,17 @@ public class FixedGapTermsIndexReader ex
final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
- ord = hi*totalIndexInterval;
+ ord = hi*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
}
@Override
public long next() {
- final int idx = 1 + (int) (ord / totalIndexInterval);
+ final long idx = 1 + (ord / indexInterval);
if (idx >= fieldIndex.numIndexTerms) {
return -1;
}
- ord += totalIndexInterval;
+ ord += indexInterval;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
@@ -232,13 +200,13 @@ public class FixedGapTermsIndexReader ex
@Override
public long seek(long ord) {
- int idx = (int) (ord / totalIndexInterval);
+ long idx = ord / indexInterval;
// caller must ensure ord is in bounds
assert idx < fieldIndex.numIndexTerms;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
- this.ord = idx * totalIndexInterval;
+ this.ord = idx * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
}
@@ -249,176 +217,58 @@ public class FixedGapTermsIndexReader ex
}
private final class FieldIndexData {
-
- volatile CoreFieldIndex coreIndex;
-
- private final long indexStart;
- private final long termsStart;
- private final long packedIndexStart;
- private final long packedOffsetsStart;
-
- private final int numIndexTerms;
-
- public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
- long packedOffsetsStart) throws IOException {
-
+ // where this field's terms begin in the packed byte[]
+ // data
+ final long termBytesStart;
+
+ // offset into index termBytes
+ final MonotonicBlockPackedReader termOffsets;
+
+ // index pointers into main terms dict
+ final MonotonicBlockPackedReader termsDictOffsets;
+
+ final long numIndexTerms;
+ final long termsStart;
+
+ public FieldIndexData(IndexInput in, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {
+
this.termsStart = termsStart;
- this.indexStart = indexStart;
- this.packedIndexStart = packedIndexStart;
- this.packedOffsetsStart = packedOffsetsStart;
+ termBytesStart = termBytes.getPointer();
+
+ IndexInput clone = in.clone();
+ clone.seek(indexStart);
+
this.numIndexTerms = numIndexTerms;
-
- if (indexDivisor > 0) {
- loadTermsIndex();
- }
- }
-
- private void loadTermsIndex() throws IOException {
- if (coreIndex == null) {
- coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
- }
- }
-
- private final class CoreFieldIndex {
-
- // where this field's terms begin in the packed byte[]
- // data
- final long termBytesStart;
-
- // offset into index termBytes
- final PackedInts.Reader termOffsets;
-
- // index pointers into main terms dict
- final PackedInts.Reader termsDictOffsets;
-
- final int numIndexTerms;
- final long termsStart;
-
- public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
-
- this.termsStart = termsStart;
- termBytesStart = termBytes.getPointer();
-
- IndexInput clone = in.clone();
- clone.seek(indexStart);
-
- // -1 is passed to mean "don't load term index", but
- // if we are then later loaded it's overwritten with
- // a real value
- assert indexDivisor > 0;
-
- this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor;
-
- assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;
-
- if (indexDivisor == 1) {
- // Default (load all index terms) is fast -- slurp in the images from disk:
-
- try {
- final long numTermBytes = packedIndexStart - indexStart;
- termBytes.copy(clone, numTermBytes);
-
- // records offsets into main terms dict file
- termsDictOffsets = PackedInts.getReader(clone);
- assert termsDictOffsets.size() == numIndexTerms;
-
- // records offsets into byte[] term data
- termOffsets = PackedInts.getReader(clone);
- assert termOffsets.size() == 1+numIndexTerms;
- } finally {
- clone.close();
- }
- } else {
- // Get packed iterators
- final IndexInput clone1 = in.clone();
- final IndexInput clone2 = in.clone();
-
- try {
- // Subsample the index terms
- clone1.seek(packedIndexStart);
- final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1, PackedInts.DEFAULT_BUFFER_SIZE);
-
- clone2.seek(packedOffsetsStart);
- final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2, PackedInts.DEFAULT_BUFFER_SIZE);
-
- // TODO: often we can get by w/ fewer bits per
- // value, below.. .but this'd be more complex:
- // we'd have to try @ fewer bits and then grow
- // if we overflowed it.
-
- PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue(), PackedInts.DEFAULT);
- PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue(), PackedInts.DEFAULT);
-
- termsDictOffsets = termsDictOffsetsM;
- termOffsets = termOffsetsM;
-
- int upto = 0;
-
- long termOffsetUpto = 0;
-
- while(upto < this.numIndexTerms) {
- // main file offset copies straight over
- termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
-
- termOffsetsM.set(upto, termOffsetUpto);
-
- long termOffset = termOffsetsIter.next();
- long nextTermOffset = termOffsetsIter.next();
- final int numTermBytes = (int) (nextTermOffset - termOffset);
-
- clone.seek(indexStart + termOffset);
- assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length();
- assert indexStart + termOffset + numTermBytes < clone.length();
-
- termBytes.copy(clone, numTermBytes);
- termOffsetUpto += numTermBytes;
-
- upto++;
- if (upto == this.numIndexTerms) {
- break;
- }
-
- // skip terms:
- termsDictOffsetsIter.next();
- for(int i=0;i<indexDivisor-2;i++) {
- termOffsetsIter.next();
- termsDictOffsetsIter.next();
- }
- }
- termOffsetsM.set(upto, termOffsetUpto);
-
- } finally {
- clone1.close();
- clone2.close();
- clone.close();
- }
- }
+ assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;
+
+ // slurp in the images from disk:
+
+ try {
+ final long numTermBytes = packedIndexStart - indexStart;
+ termBytes.copy(clone, numTermBytes);
+
+ // records offsets into main terms dict file
+ termsDictOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, numIndexTerms, false);
+
+ // records offsets into byte[] term data
+ termOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
+ } finally {
+ clone.close();
}
}
}
@Override
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
- final FieldIndexData fieldData = fields.get(fieldInfo);
- if (fieldData.coreIndex == null) {
- return null;
- } else {
- return new IndexEnum(fieldData.coreIndex);
- }
+ return new IndexEnum(fields.get(fieldInfo));
}
@Override
- public void close() throws IOException {
- if (in != null && !indexLoaded) {
- in.close();
- }
- }
+ public void close() throws IOException {}
private void seekDir(IndexInput input, long dirOffset) throws IOException {
- if (version >= FixedGapTermsIndexWriter.VERSION_APPEND_ONLY) {
- input.seek(input.length() - 8);
- dirOffset = input.readLong();
- }
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
input.seek(dirOffset);
}
}
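
[Editor's note] In the reader above, the binary search in IndexEnum.seek() is widened from int to long so that more than 2^31 index terms can be addressed, and the per-field offsets now come from MonotonicBlockPackedReader, with term n's bytes occupying [termOffsets(n), termOffsets(n+1)). Below is a self-contained sketch of that search, with plain arrays standing in for MonotonicBlockPackedReader and PagedBytes, and a plain unsigned byte compare standing in for the configurable BytesRef comparator (all names are illustrative):

    import java.util.Arrays;

    final class IndexSeekSketch {
      final byte[] termBytes;    // all indexed terms, concatenated
      final long[] termOffsets;  // numIndexTerms+1 monotonic offsets into termBytes
      final long indexInterval;  // every indexInterval'th dictionary term is indexed

      IndexSeekSketch(byte[] termBytes, long[] termOffsets, long indexInterval) {
        this.termBytes = termBytes;
        this.termOffsets = termOffsets;
        this.indexInterval = indexInterval;
      }

      // Unsigned lexicographic order, standing in for the BytesRef comparator.
      static int compareUnsigned(byte[] a, byte[] b) {
        int n = Math.min(a.length, b.length);
        for (int i = 0; i < n; i++) {
          int d = (a[i] & 0xff) - (b[i] & 0xff);
          if (d != 0) return d;
        }
        return a.length - b.length;
      }

      // Returns the ord of the greatest indexed term <= target, or -1 if the
      // target sorts before every indexed term. (The real method also resolves
      // the matching terms-dict file pointer.)
      long seek(byte[] target) {
        long lo = 0;
        long hi = termOffsets.length - 2;    // ord of the last index term
        while (hi >= lo) {
          long mid = (lo + hi) >>> 1;        // lo/hi/mid stay long, as in the patch
          int off = (int) termOffsets[(int) mid];  // int casts only because plain arrays are used here
          int len = (int) (termOffsets[(int) mid + 1] - off);
          int cmp = compareUnsigned(Arrays.copyOfRange(termBytes, off, off + len), target);
          if (cmp < 0) {
            lo = mid + 1;
          } else if (cmp > 0) {
            hi = mid - 1;
          } else {
            return mid * indexInterval;      // exact hit on an indexed term
          }
        }
        return hi < 0 ? -1 : hi * indexInterval;  // floor entry
      }
    }
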
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java Sun Aug 11 12:19:13 2013
@@ -18,15 +18,16 @@ package org.apache.lucene.codecs.blockte
*/
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import java.util.List;
@@ -50,23 +51,32 @@ public class FixedGapTermsIndexWriter ex
final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
final static int VERSION_START = 0;
final static int VERSION_APPEND_ONLY = 1;
- final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
+ final static int VERSION_MONOTONIC_ADDRESSING = 2;
+ final static int VERSION_CURRENT = VERSION_MONOTONIC_ADDRESSING;
+ final static int BLOCKSIZE = 4096;
final private int termIndexInterval;
+ public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
- @SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
-
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
+ this(state, DEFAULT_TERM_INDEX_INTERVAL);
+ }
+
+ public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval) throws IOException {
+ if (termIndexInterval <= 0) {
+ throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
+ }
+ this.termIndexInterval = termIndexInterval;
final String indexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
- termIndexInterval = state.termIndexInterval;
out = state.directory.createOutput(indexFileName, state.context);
boolean success = false;
try {
- fieldInfos = state.fieldInfos;
writeHeader(out);
- out.writeInt(termIndexInterval);
+ out.writeVInt(termIndexInterval);
+ out.writeVInt(PackedInts.VERSION_CURRENT);
+ out.writeVInt(BLOCKSIZE);
success = true;
} finally {
if (!success) {
@@ -114,22 +124,25 @@ public class FixedGapTermsIndexWriter ex
long packedOffsetsStart;
private long numTerms;
- // TODO: we could conceivably make a PackedInts wrapper
- // that auto-grows... then we wouldn't force 6 bytes RAM
- // per index term:
- private short[] termLengths;
- private int[] termsPointerDeltas;
- private long lastTermsPointer;
- private long totTermLength;
+ private RAMOutputStream offsetsBuffer = new RAMOutputStream();
+ private MonotonicBlockPackedWriter termOffsets = new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);
+ private long currentOffset;
+
+ private RAMOutputStream addressBuffer = new RAMOutputStream();
+ private MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);
private final BytesRef lastTerm = new BytesRef();
SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
- termsStart = lastTermsPointer = termsFilePointer;
- termLengths = new short[0];
- termsPointerDeltas = new int[0];
+ termsStart = termsFilePointer;
+ // we write numTerms+1 offsets; term n's length is offsets[n+1] - offsets[n]
+ try {
+ termOffsets.add(0L);
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus);
+ }
}
@Override
@@ -157,21 +170,13 @@ public class FixedGapTermsIndexWriter ex
// against prior term
out.writeBytes(text.bytes, text.offset, indexedTermLength);
- if (termLengths.length == numIndexTerms) {
- termLengths = ArrayUtil.grow(termLengths);
- }
- if (termsPointerDeltas.length == numIndexTerms) {
- termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
- }
-
// save delta terms pointer
- termsPointerDeltas[numIndexTerms] = (int) (termsFilePointer - lastTermsPointer);
- lastTermsPointer = termsFilePointer;
+ termAddresses.add(termsFilePointer - termsStart);
// save term length (in bytes)
assert indexedTermLength <= Short.MAX_VALUE;
- termLengths[numIndexTerms] = (short) indexedTermLength;
- totTermLength += indexedTermLength;
+ currentOffset += indexedTermLength;
+ termOffsets.add(currentOffset);
lastTerm.copyBytes(text);
numIndexTerms++;
@@ -183,32 +188,20 @@ public class FixedGapTermsIndexWriter ex
// write primary terms dict offsets
packedIndexStart = out.getFilePointer();
- PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(termsFilePointer), PackedInts.DEFAULT);
-
// relative to our termsStart
- long upto = 0;
- for(int i=0;i<numIndexTerms;i++) {
- upto += termsPointerDeltas[i];
- w.add(upto);
- }
- w.finish();
+ termAddresses.finish();
+ addressBuffer.writeTo(out);
packedOffsetsStart = out.getFilePointer();
// write offsets into the byte[] terms
- w = PackedInts.getWriter(out, 1+numIndexTerms, PackedInts.bitsRequired(totTermLength), PackedInts.DEFAULT);
- upto = 0;
- for(int i=0;i<numIndexTerms;i++) {
- w.add(upto);
- upto += termLengths[i];
- }
- w.add(upto);
- w.finish();
+ termOffsets.finish();
+ offsetsBuffer.writeTo(out);
// our referrer holds onto us, while other fields are
// being written, so don't tie up this RAM:
- termLengths = null;
- termsPointerDeltas = null;
+ termOffsets = termAddresses = null;
+ addressBuffer = offsetsBuffer = null;
}
}
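
[Editor's note] The writer change above replaces the grow-by-hand short[]/int[] buffers with two MonotonicBlockPackedWriters streamed into RAMOutputStreams and flushed to the index file in finish(). The layout is "numTerms+1 offsets": a leading 0 is written up front, then one cumulative offset after each term, so term n's bytes occupy [offsets[n], offsets[n+1]). A runnable sketch of just that encoding, with a plain long[] standing in for MonotonicBlockPackedWriter (all names illustrative):

    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;

    final class OffsetsSketch {
      public static void main(String[] args) throws Exception {
        String[] terms = { "apple", "banana", "cherry" };

        ByteArrayOutputStream termBytes = new ByteArrayOutputStream();
        long[] offsets = new long[terms.length + 1];
        offsets[0] = 0;                    // mirrors the constructor's termOffsets.add(0L)
        long currentOffset = 0;
        for (int i = 0; i < terms.length; i++) {
          byte[] b = terms[i].getBytes(StandardCharsets.UTF_8);
          termBytes.write(b);
          currentOffset += b.length;       // mirrors currentOffset += indexedTermLength
          offsets[i + 1] = currentOffset;  // mirrors termOffsets.add(currentOffset)
        }

        // Recover term 1 ("banana") purely from the offsets:
        int start = (int) offsets[1], end = (int) offsets[2];
        System.out.println(new String(termBytes.toByteArray(), start,
                                      end - start, StandardCharsets.UTF_8));
      }
    }
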
Modified: lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/TermsIndexReaderBase.java Sun Aug 11 12:19:13 2013
@@ -47,8 +47,6 @@ public abstract class TermsIndexReaderBa
public abstract boolean supportsOrd();
- public abstract int getDivisor();
-
/**
* Similar to TermsEnum, except, the only "metadata" it
* reports for a given indexed term is the long fileOffset