You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/03/19 12:34:34 UTC

svn commit: r925179 [1/2] - in /lucene/nutch/trunk: ./ lib/ src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/indexer/field/ src/java/org/apache/nutch/indexer/lucene/ src/java/org/apache/nutch/metadata/ sr...

Author: ab
Date: Fri Mar 19 11:34:33 2010
New Revision: 925179

URL: http://svn.apache.org/viewvc?rev=925179&view=rev
Log:
NUTCH-787 Upgrade to Lucene 3.0.1.

Added:
    lucene/nutch/trunk/lib/lucene-core-3.0.1.jar   (with props)
    lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar   (with props)
    lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar   (with props)
    lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar   (with props)
Removed:
    lucene/nutch/trunk/lib/lucene-core-2.9.1.jar
    lucene/nutch/trunk/lib/lucene-misc-2.9.1.jar
    lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.9.1.jar
    lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.9.1.jar
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
    lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
    lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
    lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
    lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java
    lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
    lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
    lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
    lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
    lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
    lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
    lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
    lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Mar 19 11:34:33 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab)
+
 * NUTCH-796 Zero results problems difficult to troubleshoot due to lack of logging (ab)
 
 * NUTCH-801 Remove RTF and MP3 parse plugins (jnioche)

Added: lucene/nutch/trunk/lib/lucene-core-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-core-3.0.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Fri Mar 19 11:34:33 2010
@@ -17,23 +17,21 @@
 
 package org.apache.nutch.analysis;
 
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-
 import java.io.*;
 import java.util.*;
 
-// Commons Logging imports
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.conf.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.nutch.searcher.Query.Phrase;
+import org.apache.nutch.searcher.Query.Term;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.ObjectCache;
-import org.apache.nutch.searcher.Query.*;
 
-/** Construct n-grams for frequently occuring terms and phrases while indexing.
+/** Construct n-grams for frequently occurring terms and phrases while indexing.
  * Optimize phrase queries to use the n-grams. Single terms are still indexed
  * too, with n-grams overlaid.  This is achieved through the use of {@link
  * Token#setPositionIncrement(int)}.*/
@@ -61,10 +59,44 @@ public class CommonGrams {
     private LinkedList<Token> nextQueue = new LinkedList<Token>();
     private StringBuffer buffer = new StringBuffer();
 
+    private final TermAttribute termAtt;
+    private final PositionIncrementAttribute posIncrAtt;
+    private final TypeAttribute typeAtt;
+    private final OffsetAttribute offsetAtt;
+
     /** Construct an n-gram producing filter. */
     public Filter(TokenStream input, HashSet<String> common) {
       super(input);
       this.common = common;
+      this.termAtt = getAttribute(TermAttribute.class);
+      this.offsetAtt = getAttribute(OffsetAttribute.class);
+      this.posIncrAtt = getAttribute(PositionIncrementAttribute.class);
+      this.typeAtt = addAttribute(TypeAttribute.class);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      clearAttributes();
+      Token t = next();
+      if (t != null) {
+        termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+        offsetAtt.setOffset(t.startOffset(), t.endOffset());
+        posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+        typeAtt.setType(t.type());
+      }     
+      return t != null;
+    }
+
+    private Token inputNext() throws IOException {
+      if (super.input.incrementToken()) {
+        Token t = new Token(
+            termAtt.termBuffer(), 0, termAtt.termLength(),
+            offsetAtt.startOffset(), offsetAtt.endOffset());
+        t.setPositionIncrement(posIncrAtt.getPositionIncrement());
+        t.setType(typeAtt.type());
+        return t;
+      }
+      return null;
     }
 
     /** Inserts n-grams into a token stream. */
@@ -103,7 +135,7 @@ public class CommonGrams {
 
     /** True iff token is for a common term. */
     private boolean isCommon(Token token) {
-      return common != null && common.contains(token.termText());
+      return common != null && common.contains(token.term());
     }
 
     /** Pops nextQueue or, if empty, reads a new token. */
@@ -111,13 +143,13 @@ public class CommonGrams {
       if (nextQueue.size() > 0)
         return nextQueue.removeFirst();
       else
-        return input.next();
+        return inputNext();
     }
 
     /** Return next token in nextQueue, extending it when empty. */
     private Token peekNext(ListIterator<Token> i) throws IOException {
       if (!i.hasNext()) {
-        Token next = input.next();
+        Token next = inputNext();
         if (next == null)
           return null;
         i.add(next);
@@ -129,9 +161,9 @@ public class CommonGrams {
     /** Construct a compound token. */
     private Token gramToken(Token first, Token second) {
       buffer.setLength(0);
-      buffer.append(first.termText());
+      buffer.append(first.term());
       buffer.append(SEPARATOR);
-      buffer.append(second.termText());
+      buffer.append(second.term());
       Token result = new Token(buffer.toString(),
                                first.startOffset(), second.endOffset(),
                                "gram");
@@ -159,24 +191,23 @@ public class CommonGrams {
         if (line.startsWith("#") || "".equals(line)) // skip comments
           continue;
         TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
-        Token token = ts.next();
-        if (token == null) {
+        TermAttribute ta = ts.getAttribute(TermAttribute.class);
+        if (!ts.incrementToken()) {
           if (LOG.isWarnEnabled()) {
             LOG.warn("Line does not contain a field name: " + line);
           }
           continue;
         }
-        String field = token.termText();
-        token = ts.next();
-        if (token == null) {
+        String field = ta.term();
+        if (!ts.incrementToken()) {
           if (LOG.isWarnEnabled()) {
             LOG.warn("Line contains only a field name, no word: " + line);
           }
           continue;
         }
-        String gram = token.termText();
-        while ((token = ts.next()) != null) {
-          gram = gram + SEPARATOR + token.termText();
+        String gram = ta.term();
+        while (ts.incrementToken()) {
+          gram = gram + SEPARATOR + ta.term();
         }
         HashSet<String> table = commonTerms.get(field);
         if (table == null) {
@@ -201,16 +232,27 @@ public class CommonGrams {
   private static class ArrayTokens extends TokenStream {
     private Term[] terms;
     private int index;
+    private final TermAttribute termAttr;
+    private final PositionIncrementAttribute posAttr;
+    private final OffsetAttribute offsetAttr;
 
     public ArrayTokens(Phrase phrase) {
       this.terms = phrase.getTerms();
+      this.termAttr = addAttribute(TermAttribute.class);
+      this.posAttr = addAttribute(PositionIncrementAttribute.class);
+      this.offsetAttr = addAttribute(OffsetAttribute.class);
     }
 
-    public Token next() {
+    @Override
+    public boolean incrementToken() throws IOException {
       if (index == terms.length)
-        return null;
-      else
-        return new Token(terms[index].toString(), index, ++index);
+        return false;
+
+      clearAttributes();
+      termAttr.setTermBuffer(terms[index].toString());
+      posAttr.setPositionIncrement(1);
+      offsetAttr.setOffset(index, ++index);
+      return true;
     }
   }
 
@@ -222,22 +264,24 @@ public class CommonGrams {
     }
     ArrayList<String> result = new ArrayList<String>();
     TokenStream ts = getFilter(new ArrayTokens(phrase), field);
-    Token token, prev=null;
+    String prev = null;
+    TermAttribute ta = ts.getAttribute(TermAttribute.class);
+    PositionIncrementAttribute pa = ts.getAttribute(PositionIncrementAttribute.class);
     int position = 0;
     try {
-      while ((token = ts.next()) != null) {
-        if (token.getPositionIncrement() != 0 && prev != null)
-          result.add(prev.termText());
-        prev = token;
-        position += token.getPositionIncrement();
-        if ((position + arity(token.termText())) == phrase.getTerms().length)
+      while (ts.incrementToken()) {
+        if (pa.getPositionIncrement() != 0 && prev != null)
+          result.add(prev);
+        prev = ta.term();
+        position += pa.getPositionIncrement();
+        if ((position + arity(ta.term())) == phrase.getTerms().length)
           break;
       }
     } catch (IOException e) {
       throw new RuntimeException(e.toString());
     }
     if (prev != null)
-      result.add(prev.termText());
+      result.add(prev);
 
     return result.toArray(new String[result.size()]);
   }
@@ -261,9 +305,12 @@ public class CommonGrams {
     TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
     CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
     ts = commonGrams.getFilter(ts, "url");
-    Token token;
-    while ((token = ts.next()) != null) {
-      System.out.println("Token: " + token);
+    TermAttribute ta = ts.getAttribute(TermAttribute.class);
+    OffsetAttribute oa = ts.getAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute pia = ts.getAttribute(PositionIncrementAttribute.class);
+    while (ts.incrementToken()) {
+      System.out.println("Token: " + ta.term() + " offs:" + oa.startOffset() + "-" + oa.endOffset()
+          + " incr: " + pia.getPositionIncrement());
     }
     String[] optimized = commonGrams.optimizePhrase(new Phrase(args), "url");
     System.out.print("Optimized: ");

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java Fri Mar 19 11:34:33 2010
@@ -10,6 +10,7 @@ import org.apache.nutch.util.NutchConfig
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.*;
 import java.util.*;
@@ -72,7 +73,7 @@ public class NutchAnalysis implements Nu
 /** Parse a query. */
   final public Query parse(Configuration conf) throws ParseException {
   Query query = new Query(conf);
-  ArrayList<String> terms;
+  ArrayList terms;
   Token token;
   String field;
   boolean stop;
@@ -140,7 +141,7 @@ public class NutchAnalysis implements Nu
         throw new ParseException();
       }
       nonOpOrTerm();
-      String[] array = terms.toArray(new String[terms.size()]);
+      String[] array = (String[])terms.toArray(new String[terms.size()]);
 
       if (stop
           && field == Clause.DEFAULT_FIELD
@@ -160,10 +161,10 @@ public class NutchAnalysis implements Nu
 
 /** Parse an explcitly quoted phrase query.  Note that this may return a single
  * term, a trivial phrase.*/
-  final public ArrayList<String> phrase(String field) throws ParseException {
+  final public ArrayList phrase(String field) throws ParseException {
   int start;
   int end;
-  ArrayList<String> result = new ArrayList<String>();
+  ArrayList result = new ArrayList();
   String term;
     jj_consume_token(QUOTE);
     start = token.endColumn;
@@ -243,10 +244,10 @@ public class NutchAnalysis implements Nu
 
 /** Parse a compound term that is interpreted as an implicit phrase query.
  * Compounds are a sequence of terms separated by infix characters.  Note that
- * htis may return a single term, a trivial compound. */
-  final public ArrayList<String> compound(String field) throws ParseException {
+ * this may return a single term, a trivial compound. */
+  final public ArrayList compound(String field) throws ParseException {
   int start;
-  ArrayList<String> result = new ArrayList<String>();
+  ArrayList result = new ArrayList();
   String term;
   StringBuffer terms = new StringBuffer();
     start = token.endColumn;
@@ -289,19 +290,23 @@ public class NutchAnalysis implements Nu
       result.add(queryString.substring(start, token.endColumn));
 
     } else {
-      org.apache.lucene.analysis.Token token;
       TokenStream tokens = analyzer.tokenStream(
                               field, new StringReader(terms.toString()));
 
-      while (true) {
-        try {
-          token = tokens.next();
-        } catch (IOException e) {
-          token = null;
+      TermAttribute ta = tokens.getAttribute(TermAttribute.class);
+      try
+      {
+        String termText;
+        while (tokens.incrementToken())
+        {
+          if ((termText = ta.term()) == null)
+            break;
+          result.add(termText);
         }
-        if (token == null) { break; }
-        result.add(token.termText());
+      } catch (IOException e) {
+        // ignore (?)
       }
+//
       try {
         tokens.close();
       } catch (IOException e) {
@@ -470,76 +475,77 @@ public class NutchAnalysis implements Nu
     }
   }
 
-  final private boolean jj_2_1(int xla) {
+  private boolean jj_2_1(int xla) {
     jj_la = xla; jj_lastpos = jj_scanpos = token;
     try { return !jj_3_1(); }
     catch(LookaheadSuccess ls) { return true; }
     finally { jj_save(0, xla); }
   }
 
-  final private boolean jj_2_2(int xla) {
+  private boolean jj_2_2(int xla) {
     jj_la = xla; jj_lastpos = jj_scanpos = token;
     try { return !jj_3_2(); }
     catch(LookaheadSuccess ls) { return true; }
     finally { jj_save(1, xla); }
   }
 
-  final private boolean jj_2_3(int xla) {
+  private boolean jj_2_3(int xla) {
     jj_la = xla; jj_lastpos = jj_scanpos = token;
     try { return !jj_3_3(); }
     catch(LookaheadSuccess ls) { return true; }
     finally { jj_save(2, xla); }
   }
 
-  final private boolean jj_3_1() {
-    if (jj_scan_token(WORD)) return true;
-    if (jj_scan_token(COLON)) return true;
+  private boolean jj_3_3() {
     Token xsp;
     xsp = jj_scanpos;
-    if (jj_3R_8()) {
+    if (jj_scan_token(15)) {
     jj_scanpos = xsp;
-    if (jj_3R_9()) return true;
+    if (jj_3R_12()) {
+    jj_scanpos = xsp;
+    if (jj_3R_13()) return true;
     }
+    }
+    return false;
+  }
+
+  private boolean jj_3R_27() {
+    if (jj_3R_16()) return true;
     return false;
   }
 
-  final private boolean jj_3R_16() {
+  private boolean jj_3R_25() {
+    if (jj_3R_24()) return true;
+    return false;
+  }
+
+  private boolean jj_3R_23() {
+    if (jj_3R_24()) return true;
+    return false;
+  }
+
+  private boolean jj_3R_18() {
     Token xsp;
     xsp = jj_scanpos;
-    if (jj_scan_token(7)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(8)) {
+    if (jj_3R_23()) {
     jj_scanpos = xsp;
-    if (jj_3R_22()) return true;
-    }
+    if (jj_scan_token(0)) return true;
     }
     return false;
   }
 
-  final private boolean jj_3_3() {
+  private boolean jj_3R_13() {
     Token xsp;
     xsp = jj_scanpos;
-    if (jj_scan_token(15)) {
-    jj_scanpos = xsp;
-    if (jj_3R_12()) {
+    if (jj_scan_token(7)) {
     jj_scanpos = xsp;
-    if (jj_3R_13()) return true;
-    }
+    if (jj_scan_token(8)) return true;
     }
+    if (jj_3R_18()) return true;
     return false;
   }
 
-  final private boolean jj_3R_25() {
-    if (jj_3R_24()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_27() {
-    if (jj_3R_16()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_20() {
+  private boolean jj_3R_20() {
     if (jj_3R_11()) return true;
     Token xsp;
     while (true) {
@@ -549,17 +555,17 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
-  final private boolean jj_3R_10() {
+  private boolean jj_3R_10() {
     if (jj_3R_16()) return true;
     return false;
   }
 
-  final private boolean jj_3R_19() {
+  private boolean jj_3R_19() {
     if (jj_3R_24()) return true;
     return false;
   }
 
-  final private boolean jj_3_2() {
+  private boolean jj_3_2() {
     Token xsp;
     if (jj_3R_10()) return true;
     while (true) {
@@ -570,38 +576,22 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
-  final private boolean jj_3R_23() {
-    if (jj_3R_24()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_18() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_3R_23()) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(0)) return true;
-    }
+  private boolean jj_3R_9() {
+    if (jj_3R_15()) return true;
     return false;
   }
 
-  final private boolean jj_3R_13() {
+  private boolean jj_3R_24() {
     Token xsp;
     xsp = jj_scanpos;
-    if (jj_scan_token(7)) {
+    if (jj_scan_token(15)) {
     jj_scanpos = xsp;
-    if (jj_scan_token(8)) return true;
+    if (jj_3R_27()) return true;
     }
-    if (jj_3R_18()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_9() {
-    if (jj_3R_15()) return true;
     return false;
   }
 
-  final private boolean jj_3R_14() {
+  private boolean jj_3R_14() {
     if (jj_scan_token(QUOTE)) return true;
     Token xsp;
     while (true) {
@@ -620,22 +610,17 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
-  final private boolean jj_3R_24() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(15)) {
-    jj_scanpos = xsp;
-    if (jj_3R_27()) return true;
-    }
+  private boolean jj_3R_26() {
+    if (jj_3R_16()) return true;
     return false;
   }
 
-  final private boolean jj_3R_26() {
-    if (jj_3R_16()) return true;
+  private boolean jj_3R_22() {
+    if (jj_3R_17()) return true;
     return false;
   }
 
-  final private boolean jj_3R_21() {
+  private boolean jj_3R_21() {
     Token xsp;
     if (jj_3R_26()) return true;
     while (true) {
@@ -646,22 +631,12 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
-  final private boolean jj_3R_22() {
-    if (jj_3R_17()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_8() {
-    if (jj_3R_14()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_12() {
+  private boolean jj_3R_12() {
     if (jj_3R_17()) return true;
     return false;
   }
 
-  final private boolean jj_3R_11() {
+  private boolean jj_3R_11() {
     Token xsp;
     xsp = jj_scanpos;
     if (jj_scan_token(1)) {
@@ -674,7 +649,12 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
-  final private boolean jj_3R_15() {
+  private boolean jj_3R_8() {
+    if (jj_3R_14()) return true;
+    return false;
+  }
+
+  private boolean jj_3R_15() {
     if (jj_3R_11()) return true;
     Token xsp;
     while (true) {
@@ -684,7 +664,7 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
-  final private boolean jj_3R_17() {
+  private boolean jj_3R_17() {
     Token xsp;
     xsp = jj_scanpos;
     if (jj_scan_token(10)) {
@@ -703,25 +683,54 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
+  private boolean jj_3_1() {
+    if (jj_scan_token(WORD)) return true;
+    if (jj_scan_token(COLON)) return true;
+    Token xsp;
+    xsp = jj_scanpos;
+    if (jj_3R_8()) {
+    jj_scanpos = xsp;
+    if (jj_3R_9()) return true;
+    }
+    return false;
+  }
+
+  private boolean jj_3R_16() {
+    Token xsp;
+    xsp = jj_scanpos;
+    if (jj_scan_token(7)) {
+    jj_scanpos = xsp;
+    if (jj_scan_token(8)) {
+    jj_scanpos = xsp;
+    if (jj_3R_22()) return true;
+    }
+    }
+    return false;
+  }
+
+  /** Generated Token Manager. */
   public NutchAnalysisTokenManager token_source;
-  public Token token, jj_nt;
+  /** Current token. */
+  public Token token;
+  /** Next token. */
+  public Token jj_nt;
   private int jj_ntk;
   private Token jj_scanpos, jj_lastpos;
   private int jj_la;
-  public boolean lookingAhead = false;
   private int jj_gen;
   final private int[] jj_la1 = new int[16];
   static private int[] jj_la1_0;
   static {
-      jj_la1_0();
+      jj_la1_init_0();
    }
-   private static void jj_la1_0() {
+   private static void jj_la1_init_0() {
       jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
    }
   final private JJCalls[] jj_2_rtns = new JJCalls[3];
   private boolean jj_rescan = false;
   private int jj_gc = 0;
 
+  /** Constructor with user supplied CharStream. */
   public NutchAnalysis(CharStream stream) {
     token_source = new NutchAnalysisTokenManager(stream);
     token = new Token();
@@ -731,6 +740,7 @@ public class NutchAnalysis implements Nu
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
+  /** Reinitialise. */
   public void ReInit(CharStream stream) {
     token_source.ReInit(stream);
     token = new Token();
@@ -740,6 +750,7 @@ public class NutchAnalysis implements Nu
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
+  /** Constructor with generated Token Manager. */
   public NutchAnalysis(NutchAnalysisTokenManager tm) {
     token_source = tm;
     token = new Token();
@@ -749,6 +760,7 @@ public class NutchAnalysis implements Nu
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
+  /** Reinitialise. */
   public void ReInit(NutchAnalysisTokenManager tm) {
     token_source = tm;
     token = new Token();
@@ -758,7 +770,7 @@ public class NutchAnalysis implements Nu
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
-  final private Token jj_consume_token(int kind) throws ParseException {
+  private Token jj_consume_token(int kind) throws ParseException {
     Token oldToken;
     if ((oldToken = token).next != null) token = token.next;
     else token = token.next = token_source.getNextToken();
@@ -782,10 +794,9 @@ public class NutchAnalysis implements Nu
     throw generateParseException();
   }
 
-  @SuppressWarnings("serial")
   static private final class LookaheadSuccess extends java.lang.Error { }
   final private LookaheadSuccess jj_ls = new LookaheadSuccess();
-  final private boolean jj_scan_token(int kind) {
+  private boolean jj_scan_token(int kind) {
     if (jj_scanpos == jj_lastpos) {
       jj_la--;
       if (jj_scanpos.next == null) {
@@ -806,6 +817,8 @@ public class NutchAnalysis implements Nu
     return false;
   }
 
+
+/** Get the next Token. */
   final public Token getNextToken() {
     if (token.next != null) token = token.next;
     else token = token.next = token_source.getNextToken();
@@ -814,8 +827,9 @@ public class NutchAnalysis implements Nu
     return token;
   }
 
+/** Get the specific Token. */
   final public Token getToken(int index) {
-    Token t = lookingAhead ? jj_scanpos : token;
+    Token t = token;
     for (int i = 0; i < index; i++) {
       if (t.next != null) t = t.next;
       else t = t.next = token_source.getNextToken();
@@ -823,14 +837,14 @@ public class NutchAnalysis implements Nu
     return t;
   }
 
-  final private int jj_ntk() {
+  private int jj_ntk() {
     if ((jj_nt=token.next) == null)
       return (jj_ntk = (token.next=token_source.getNextToken()).kind);
     else
       return (jj_ntk = jj_nt.kind);
   }
 
-  private java.util.Vector<int[]> jj_expentries = new java.util.Vector<int[]>();
+  private java.util.List jj_expentries = new java.util.ArrayList();
   private int[] jj_expentry;
   private int jj_kind = -1;
   private int[] jj_lasttokens = new int[100];
@@ -845,31 +859,26 @@ public class NutchAnalysis implements Nu
       for (int i = 0; i < jj_endpos; i++) {
         jj_expentry[i] = jj_lasttokens[i];
       }
-      boolean exists = false;
-      for (java.util.Enumeration<int[]> e = jj_expentries.elements(); e.hasMoreElements();) {
-        int[] oldentry = (e.nextElement());
+      jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
+        int[] oldentry = (int[])(it.next());
         if (oldentry.length == jj_expentry.length) {
-          exists = true;
           for (int i = 0; i < jj_expentry.length; i++) {
             if (oldentry[i] != jj_expentry[i]) {
-              exists = false;
-              break;
+              continue jj_entries_loop;
             }
           }
-          if (exists) break;
+          jj_expentries.add(jj_expentry);
+          break jj_entries_loop;
         }
       }
-      if (!exists) jj_expentries.addElement(jj_expentry);
       if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
     }
   }
 
+  /** Generate ParseException. */
   public ParseException generateParseException() {
-    jj_expentries.removeAllElements();
+    jj_expentries.clear();
     boolean[] la1tokens = new boolean[20];
-    for (int i = 0; i < 20; i++) {
-      la1tokens[i] = false;
-    }
     if (jj_kind >= 0) {
       la1tokens[jj_kind] = true;
       jj_kind = -1;
@@ -887,7 +896,7 @@ public class NutchAnalysis implements Nu
       if (la1tokens[i]) {
         jj_expentry = new int[1];
         jj_expentry[0] = i;
-        jj_expentries.addElement(jj_expentry);
+        jj_expentries.add(jj_expentry);
       }
     }
     jj_endpos = 0;
@@ -895,18 +904,20 @@ public class NutchAnalysis implements Nu
     jj_add_error_token(0, 0);
     int[][] exptokseq = new int[jj_expentries.size()][];
     for (int i = 0; i < jj_expentries.size(); i++) {
-      exptokseq[i] = jj_expentries.elementAt(i);
+      exptokseq[i] = (int[])jj_expentries.get(i);
     }
     return new ParseException(token, exptokseq, tokenImage);
   }
 
+  /** Enable tracing. */
   final public void enable_tracing() {
   }
 
+  /** Disable tracing. */
   final public void disable_tracing() {
   }
 
-  final private void jj_rescan_token() {
+  private void jj_rescan_token() {
     jj_rescan = true;
     for (int i = 0; i < 3; i++) {
     try {
@@ -927,7 +938,7 @@ public class NutchAnalysis implements Nu
     jj_rescan = false;
   }
 
-  final private void jj_save(int index, int xla) {
+  private void jj_save(int index, int xla) {
     JJCalls p = jj_2_rtns[index];
     while (p.gen > jj_gen) {
       if (p.next == null) { p = p.next = new JJCalls(); break; }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj Fri Mar 19 11:34:33 2010
@@ -38,6 +38,7 @@ import org.apache.nutch.util.NutchConfig
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.*;
 import java.util.*;
@@ -277,7 +278,7 @@ ArrayList phrase(String field) :
 
 /** Parse a compound term that is interpreted as an implicit phrase query.
  * Compounds are a sequence of terms separated by infix characters.  Note that
- * htis may return a single term, a trivial compound. */
+ * this may return a single term, a trivial compound. */
 ArrayList compound(String field) :
 {
   int start;
@@ -305,19 +306,23 @@ ArrayList compound(String field) :
       result.add(queryString.substring(start, token.endColumn));
 
     } else {
-      org.apache.lucene.analysis.Token token;
       TokenStream tokens = analyzer.tokenStream(
                               field, new StringReader(terms.toString()));
 
-      while (true) {
-        try {
-          token = tokens.next();
-        } catch (IOException e) {
-          token = null;
+      TermAttribute ta = tokens.getAttribute(TermAttribute.class);
+      try
+      {
+        String termText;
+        while (tokens.incrementToken())
+        {
+          if ((termText = ta.term()) == null)
+            break;
+          result.add(termText);
         }
-        if (token == null) { break; }
-        result.add(token.termText());
+      } catch (IOException e) {
+        // ignore (?)
       }
+//
       try {
         tokens.close();
       } catch (IOException e) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java Fri Mar 19 11:34:33 2010
@@ -1,31 +1,58 @@
 /* Generated By:JavaCC: Do not edit this line. NutchAnalysisConstants.java */
 package org.apache.nutch.analysis;
 
+
+/** 
+ * Token literal values and constants.
+ * Generated by org.javacc.parser.OtherFilesGen#start()
+ */
 public interface NutchAnalysisConstants {
 
+  /** End of File. */
   int EOF = 0;
+  /** RegularExpression Id. */
   int WORD = 1;
+  /** RegularExpression Id. */
   int ACRONYM = 2;
+  /** RegularExpression Id. */
   int SIGRAM = 3;
+  /** RegularExpression Id. */
   int IRREGULAR_WORD = 4;
+  /** RegularExpression Id. */
   int C_PLUS_PLUS = 5;
+  /** RegularExpression Id. */
   int C_SHARP = 6;
+  /** RegularExpression Id. */
   int PLUS = 7;
+  /** RegularExpression Id. */
   int MINUS = 8;
+  /** RegularExpression Id. */
   int QUOTE = 9;
+  /** RegularExpression Id. */
   int COLON = 10;
+  /** RegularExpression Id. */
   int SLASH = 11;
+  /** RegularExpression Id. */
   int DOT = 12;
+  /** RegularExpression Id. */
   int ATSIGN = 13;
+  /** RegularExpression Id. */
   int APOSTROPHE = 14;
+  /** RegularExpression Id. */
   int WHITE = 15;
+  /** RegularExpression Id. */
   int WORD_PUNCT = 16;
+  /** RegularExpression Id. */
   int LETTER = 17;
+  /** RegularExpression Id. */
   int CJK = 18;
+  /** RegularExpression Id. */
   int DIGIT = 19;
 
+  /** Lexical state. */
   int DEFAULT = 0;
 
+  /** Literal token values. */
   String[] tokenImage = {
     "<EOF>",
     "<WORD>",

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java Fri Mar 19 11:34:33 2010
@@ -1,23 +1,38 @@
 /* Generated By:JavaCC: Do not edit this line. NutchAnalysisTokenManager.java */
 package org.apache.nutch.analysis;
-
 import java.io.*;
 
+/** Token Manager. */
 public class NutchAnalysisTokenManager implements NutchAnalysisConstants
 {
   /** Constructs a token manager for the provided Reader. */
   public NutchAnalysisTokenManager(Reader reader) {
     this(new FastCharStream(reader));
   }
+
+  /** Debug output. */
   public  java.io.PrintStream debugStream = System.out;
+  /** Set debug output. */
   public  void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
-private final int jjStopAtPos(int pos, int kind)
+private final int jjStopStringLiteralDfa_0(int pos, long active0)
+{
+   switch (pos)
+   {
+      default :
+         return -1;
+   }
+}
+private final int jjStartNfa_0(int pos, long active0)
+{
+   return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0), pos + 1);
+}
+private int jjStopAtPos(int pos, int kind)
 {
    jjmatchedKind = kind;
    jjmatchedPos = pos;
    return pos + 1;
 }
-private final int jjMoveStringLiteralDfa0_0()
+private int jjMoveStringLiteralDfa0_0()
 {
    switch(curChar)
    {
@@ -41,20 +56,6 @@ private final int jjMoveStringLiteralDfa
          return jjMoveNfa_0(1, 0);
    }
 }
-private final void jjCheckNAdd(int state)
-{
-   if (jjrounds[state] != jjround)
-   {
-      jjstateSet[jjnewStateCnt++] = state;
-      jjrounds[state] = jjround;
-   }
-}
-private final void jjAddStates(int start, int end)
-{
-   do {
-      jjstateSet[jjnewStateCnt++] = jjnextStates[start];
-   } while (start++ != end);
-}
 static final long[] jjbitVec0 = {
    0xfffffffeL, 0x0L, 0x0L, 0x0L
 };
@@ -76,7 +77,7 @@ static final long[] jjbitVec6 = {
 static final long[] jjbitVec7 = {
    0x3fffffffffffL, 0x0L, 0x0L, 0x0L
 };
-private final int jjMoveNfa_0(int startState, int curPos)
+private int jjMoveNfa_0(int startState, int curPos)
 {
    int startsAt = 0;
    jjnewStateCnt = 10;
@@ -257,26 +258,36 @@ private static final boolean jjCanMove_1
          return false;
    }
 }
+
+/** Token literal values. */
 public static final String[] jjstrLiteralImages = {
 "", null, null, null, null, null, null, "\53", "\55", "\42", "\72", "\57", 
 "\56", "\100", "\47", null, null, null, null, null, };
+
+/** Lexer state names. */
 public static final String[] lexStateNames = {
    "DEFAULT", 
 };
 protected CharStream input_stream;
 private final int[] jjrounds = new int[10];
 private final int[] jjstateSet = new int[20];
-StringBuffer image;
-int jjimageLen;
-int lengthOfMatch;
+private final StringBuffer jjimage = new StringBuffer();
+private StringBuffer image = jjimage;
+private int jjimageLen;
+private int lengthOfMatch;
 protected char curChar;
+/** Constructor. */
 public NutchAnalysisTokenManager(CharStream stream){
    input_stream = stream;
 }
+
+/** Constructor. */
 public NutchAnalysisTokenManager(CharStream stream, int lexState){
    this(stream);
    SwitchTo(lexState);
 }
+
+/** Reinitialise parser. */
 public void ReInit(CharStream stream)
 {
    jjmatchedPos = jjnewStateCnt = 0;
@@ -284,18 +295,22 @@ public void ReInit(CharStream stream)
    input_stream = stream;
    ReInitRounds();
 }
-private final void ReInitRounds()
+private void ReInitRounds()
 {
    int i;
    jjround = 0x80000001;
    for (i = 10; i-- > 0;)
       jjrounds[i] = 0x80000000;
 }
+
+/** Reinitialise parser. */
 public void ReInit(CharStream stream, int lexState)
 {
    ReInit(stream);
    SwitchTo(lexState);
 }
+
+/** Switch to specified lex state. */
 public void SwitchTo(int lexState)
 {
    if (lexState >= 1 || lexState < 0)
@@ -306,14 +321,25 @@ public void SwitchTo(int lexState)
 
 protected Token jjFillToken()
 {
-   Token t = Token.newToken(jjmatchedKind);
-   t.kind = jjmatchedKind;
+   final Token t;
+   final String curTokenImage;
+   final int beginLine;
+   final int endLine;
+   final int beginColumn;
+   final int endColumn;
    String im = jjstrLiteralImages[jjmatchedKind];
-   t.image = (im == null) ? input_stream.GetImage() : im;
-   t.beginLine = input_stream.getBeginLine();
-   t.beginColumn = input_stream.getBeginColumn();
-   t.endLine = input_stream.getEndLine();
-   t.endColumn = input_stream.getEndColumn();
+   curTokenImage = (im == null) ? input_stream.GetImage() : im;
+   beginLine = input_stream.getBeginLine();
+   beginColumn = input_stream.getBeginColumn();
+   endLine = input_stream.getEndLine();
+   endColumn = input_stream.getEndColumn();
+   t = Token.newToken(jjmatchedKind, curTokenImage);
+
+   t.beginLine = beginLine;
+   t.endLine = endLine;
+   t.beginColumn = beginColumn;
+   t.endColumn = endColumn;
+
    return t;
 }
 
@@ -324,11 +350,13 @@ int jjround;
 int jjmatchedPos;
 int jjmatchedKind;
 
+/** Get the next Token. */
 public Token getNextToken() 
 {
   Token matchedToken;
   int curPos = 0;
 
+  EOFLoop :
   for (;;)
   {   
    try   
@@ -341,7 +369,8 @@ public Token getNextToken() 
       matchedToken = jjFillToken();
       return matchedToken;
    }
-   image = null;
+   image = jjimage;
+   image.setLength(0);
    jjimageLen = 0;
 
    jjmatchedKind = 0x7fffffff;
@@ -387,15 +416,11 @@ void TokenLexicalActions(Token matchedTo
    switch(jjmatchedKind)
    {
       case 1 :
-        if (image == null)
-            image = new StringBuffer();
-            image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
+        image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
     matchedToken.image = matchedToken.image.toLowerCase();
          break;
       case 2 :
-        if (image == null)
-            image = new StringBuffer();
-            image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
+        image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
                                                   // remove dots
       for (int i = 0; i < image.length(); i++) {
         if (image.charAt(i) == '.')
@@ -407,4 +432,24 @@ void TokenLexicalActions(Token matchedTo
          break;
    }
 }
+private void jjCheckNAdd(int state)
+{
+   if (jjrounds[state] != jjround)
+   {
+      jjstateSet[jjnewStateCnt++] = state;
+      jjrounds[state] = jjround;
+   }
+}
+private void jjAddStates(int start, int end)
+{
+   do {
+      jjstateSet[jjnewStateCnt++] = jjnextStates[start];
+   } while (start++ != end);
+}
+private void jjCheckNAddTwoStates(int state1, int state2)
+{
+   jjCheckNAdd(state1);
+   jjCheckNAdd(state2);
+}
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Fri Mar 19 11:34:33 2010
@@ -24,7 +24,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.hadoop.conf.Configuration;
 
 /**
@@ -71,21 +71,24 @@ public class NutchDocumentAnalyzer exten
   }
 
   private static class AnchorFilter extends TokenFilter {
+    private final PositionIncrementAttribute posAttr;
     private boolean first = true;
 
     public AnchorFilter(TokenStream input) {
       super(input);
+      // The super filter must have positional information.
+      posAttr = input.getAttribute(PositionIncrementAttribute.class);
     }
 
-    public final Token next() throws IOException {
-      Token result = input.next();
-      if (result == null)
-        return result;
-      if (first) {
-        result.setPositionIncrement(INTER_ANCHOR_GAP);
-        first = false;
+    public boolean incrementToken() throws IOException {
+      boolean hasNext = input.incrementToken();
+      if (hasNext) {
+        if (first) {
+          // Widen only the first token's position increment so successive
+          // anchors are separated by a large positional gap.
+          posAttr.setPositionIncrement(INTER_ANCHOR_GAP);
+          first = false;
+        }
       }
-      return result;
+      return hasNext; // returning false here would truncate the whole stream
     }
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java Fri Mar 19 11:34:33 2010
@@ -19,8 +19,9 @@ package org.apache.nutch.analysis;
 
 import java.io.*;
 
-import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
 
 /** The tokenizer used for Nutch document text.  Implemented in terms of our
  * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared
@@ -28,18 +29,27 @@ import org.apache.lucene.analysis.Token;
  */
 public final class NutchDocumentTokenizer extends Tokenizer
   implements NutchAnalysisConstants {
-  
-  private NutchAnalysisTokenManager tokenManager;
+
+  private final NutchAnalysisTokenManager tokenManager;
+
+  private final TermAttribute termAtt;
+  private final PositionIncrementAttribute posIncrAtt;
+  private final TypeAttribute typeAtt;
+  private final OffsetAttribute offsetAtt;
 
   /** Construct a tokenizer for the text in a Reader. */
   public NutchDocumentTokenizer(Reader reader) {
     super(reader);
+
     tokenManager = new NutchAnalysisTokenManager(reader); 
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.offsetAtt = addAttribute(OffsetAttribute.class);
+    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    this.typeAtt = addAttribute(TypeAttribute.class);
   }
-  
-  /** Returns the next token in the stream, or null at EOF. */
-  public final Token next() throws IOException {
 
+  /** Returns the next token in the stream, or null at EOF. */
+  private final Token next() throws IOException {
     org.apache.nutch.analysis.Token t;
 
     try {
@@ -64,6 +74,23 @@ public final class NutchDocumentTokenize
     }
   }
 
+  /** Lucene 3.0 API. */
+  public boolean incrementToken() throws IOException
+  {
+    clearAttributes();
+
+    final Token t = next();
+    if (t != null) {
+      termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+      offsetAtt.setOffset(t.startOffset(), t.endOffset());
+      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+      typeAtt.setType(t.type());
+      return true;
+    } else {
+      return false;
+    }
+  }
+
   /** For debugging. */
   public static void main(String[] args) throws Exception {
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
@@ -71,14 +98,13 @@ public final class NutchDocumentTokenize
       System.out.print("Text: ");
       String line = in.readLine();
       Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
-      Token token;
+      TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
       System.out.print("Tokens: ");
-      while ((token = tokenizer.next()) != null) {
-        System.out.print(token.termText());
+      while (tokenizer.incrementToken()) {
+        System.out.print(termAtt.term());
         System.out.print(" ");
       }
       System.out.println();
     }
   }
-
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java Fri Mar 19 11:34:33 2010
@@ -1,4 +1,4 @@
-/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 3.0 */
+/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
 package org.apache.nutch.analysis;
 
 /**
@@ -11,7 +11,7 @@ package org.apache.nutch.analysis;
  * mechanisms so long as you retain the public fields.
  */
 @SuppressWarnings("serial")
-class ParseException extends java.io.IOException  {
+public class ParseException extends java.io.IOException {
 
   /**
    * This constructor is used by the method "generateParseException"
@@ -52,6 +52,7 @@ class ParseException extends java.io.IOE
     specialConstructor = false;
   }
 
+  /** Constructor with message. */
   public ParseException(String message) {
     super(message);
     specialConstructor = false;
@@ -99,19 +100,19 @@ class ParseException extends java.io.IOE
     if (!specialConstructor) {
       return super.getMessage();
     }
-    String expected = "";
+    StringBuffer expected = new StringBuffer();
     int maxSize = 0;
     for (int i = 0; i < expectedTokenSequences.length; i++) {
       if (maxSize < expectedTokenSequences[i].length) {
         maxSize = expectedTokenSequences[i].length;
       }
       for (int j = 0; j < expectedTokenSequences[i].length; j++) {
-        expected += tokenImage[expectedTokenSequences[i][j]] + " ";
+        expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
       }
       if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
-        expected += "...";
+        expected.append("...");
       }
-      expected += eol + "    ";
+      expected.append(eol).append("    ");
     }
     String retval = "Encountered \"";
     Token tok = currentToken.next;
@@ -121,7 +122,10 @@ class ParseException extends java.io.IOE
         retval += tokenImage[0];
         break;
       }
+      retval += " " + tokenImage[tok.kind];
+      retval += " \"";
       retval += add_escapes(tok.image);
+      retval += " \"";
       tok = tok.next; 
     }
     retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
@@ -131,7 +135,7 @@ class ParseException extends java.io.IOE
     } else {
       retval += "Was expecting one of:" + eol + "    ";
     }
-    retval += expected;
+    retval += expected.toString();
     return retval;
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java Fri Mar 19 11:34:33 2010
@@ -1,4 +1,4 @@
-/* Generated By:JavaCC: Do not edit this line. Token.java Version 3.0 */
+/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
 package org.apache.nutch.analysis;
 
 /**
@@ -14,12 +14,14 @@ class Token {
    */
   public int kind;
 
-  /**
-   * beginLine and beginColumn describe the position of the first character
-   * of this token; endLine and endColumn describe the position of the
-   * last character of this token.
-   */
-  public int beginLine, beginColumn, endLine, endColumn;
+  /** The line number of the first character of this Token. */
+  public int beginLine;
+  /** The column number of the first character of this Token. */
+  public int beginColumn;
+  /** The line number of the last character of this Token. */
+  public int endLine;
+  /** The column number of the last character of this Token. */
+  public int endColumn;
 
   /**
    * The string image of the token.
@@ -51,6 +53,40 @@ class Token {
   public Token specialToken;
 
   /**
+   * An optional attribute value of the Token.
+   * Tokens which are not used as syntactic sugar will often contain
+   * meaningful values that will be used later on by the compiler or
+   * interpreter. This attribute value is often different from the image.
+   * Any subclass of Token that actually wants to return a non-null value can
+   * override this method as appropriate.
+   */
+  public Object getValue() {
+    return null;
+  }
+
+  /**
+   * No-argument constructor
+   */
+  public Token() {}
+
+  /**
+   * Constructs a new token for the specified Image.
+   */
+  public Token(int kind)
+  {
+     this(kind, null);
+  }
+
+  /**
+   * Constructs a new token for the specified Image and Kind.
+   */
+  public Token(int kind, String image)
+  {
+     this.kind = kind;
+     this.image = image;
+  }
+
+  /**
    * Returns the image.
    */
   public String toString()
@@ -63,19 +99,25 @@ class Token {
    * can create and return subclass objects based on the value of ofKind.
    * Simply add the cases to the switch for all those special cases.
    * For example, if you have a subclass of Token called IDToken that
-   * you want to create if ofKind is ID, simlpy add something like :
+   * you want to create if ofKind is ID, simply add something like :
    *
-   *    case MyParserConstants.ID : return new IDToken();
+   *    case MyParserConstants.ID : return new IDToken(ofKind, image);
    *
    * to the following switch statement. Then you can cast matchedToken
-   * variable to the appropriate type and use it in your lexical actions.
+   * variable to the appropriate type and use it in your lexical actions.
    */
-  public static final Token newToken(int ofKind)
+  public static Token newToken(int ofKind, String image)
   {
      switch(ofKind)
      {
-       default : return new Token();
+       default : return new Token(ofKind, image);
      }
   }
 
+  public static Token newToken(int ofKind)
+  {
+     return newToken(ofKind, null);
+  }
+
 }
+/* JavaCC - OriginalChecksum=6925860b4b6a41d42c759eab47d0d3a3 (do not edit this line) */

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Fri Mar 19 11:34:33 2010
@@ -379,7 +379,7 @@ public class DeleteDuplicates extends Co
                      OutputCollector<WritableComparable, Writable> output, Reporter reporter)
     throws IOException {
     Path index = new Path(key.toString());
-    IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
+    IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()), false);
     try {
       while (values.hasNext()) {
         IntWritable value = values.next();

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java Fri Mar 19 11:34:33 2010
@@ -64,7 +64,7 @@ public class FsDirectory extends Directo
     }
   }
 
-  public String[] list() throws IOException {
+  public String[] listAll() throws IOException {
     FileStatus[] fstats = fs.listStatus(directory, HadoopFSUtil.getPassAllFilter());
     Path[] files = HadoopFSUtil.getPaths(fstats);
     if (files == null) return null;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java Fri Mar 19 11:34:33 2010
@@ -17,11 +17,13 @@
 
 package org.apache.nutch.indexer;
 
+import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 
+import java.io.File;
 import java.io.OutputStreamWriter;
 
 /** Lists the most frequent terms in an index. */
@@ -37,14 +39,12 @@ public class HighFreqTerms {
     Term term;
   }
 
-  private static class TermFreqQueue extends PriorityQueue {
+  private static class TermFreqQueue extends PriorityQueue<TermFreq> {
     TermFreqQueue(int size) {
       initialize(size);
     }
 
-    protected final boolean lessThan(Object a, Object b) {
-      TermFreq termInfoA = (TermFreq)a;
-      TermFreq termInfoB = (TermFreq)b;
+    protected final boolean lessThan(TermFreq termInfoA, TermFreq termInfoB) {
       return termInfoA.docFreq < termInfoB.docFreq;
     }
   }
@@ -66,7 +66,7 @@ public class HighFreqTerms {
       } else if (args[i].equals("-nofreqs")) {    // found -nofreqs option
         noFreqs = true;
       } else {
-        reader = IndexReader.open(args[i]);
+        reader = IndexReader.open(FSDirectory.open(new File(args[i])));
       }
     }
 
@@ -76,10 +76,10 @@ public class HighFreqTerms {
     int minFreq = 0;
     while (terms.next()) {
       if (terms.docFreq() > minFreq) {
-        tiq.put(new TermFreq(terms.term(), terms.docFreq()));
+        tiq.add(new TermFreq(terms.term(), terms.docFreq()));
         if (tiq.size() >= count) {                 // if tiq overfull
           tiq.pop();                              // remove lowest in tiq
+          minFreq = ((TermFreq)tiq.top()).docFreq; // new lowest retained freq
         }
       }
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Fri Mar 19 11:34:33 2010
@@ -33,7 +33,10 @@ import org.apache.nutch.util.LogUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 
 /*************************************************************************
  * IndexMerger creates an index for the output corresponding to a 
@@ -86,15 +89,18 @@ public class IndexMerger extends Configu
     //
     // Merge indices
     //
-    IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
-    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+    IndexWriter writer = new IndexWriter(
+    		FSDirectory.open(new File(localOutput.toString())), null, true,
+    				MaxFieldLength.UNLIMITED);
+    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR));
     writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
-    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS));
     writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
     writer.setInfoStream(LogUtil.getDebugStream(LOG));
     writer.setUseCompoundFile(false);
     writer.setSimilarity(new NutchSimilarity());
-    writer.addIndexes(dirs);
+    writer.addIndexesNoOptimize(dirs);
+    writer.optimize();
     writer.close();
 
     //

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Fri Mar 19 11:34:33 2010
@@ -23,6 +23,7 @@ import java.util.Date;
 import java.util.Arrays;
 
 import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.document.*;
 import org.apache.lucene.store.*;
 import org.apache.lucene.search.*;
@@ -188,7 +189,7 @@ public class IndexSorter extends Configu
     }
 
     public Document document(int n) throws IOException {
-      return super.document(newToOld[n]);
+      return document(n, null);
     }
 
     public Document document(int n, FieldSelector fieldSelector)
@@ -263,11 +264,13 @@ public class IndexSorter extends Configu
     LOG.info("IndexSorter: starting.");
     Date start = new Date();
     int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
-    IndexReader reader = IndexReader.open(new File(directory, "index"));
+    IndexReader reader = IndexReader.open(
+    		FSDirectory.open(new File(directory, "index")));
 
     SortingReader sorter = new SortingReader(reader, oldToNew(reader));
-    IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"),
-                                         null, true);
+    IndexWriter writer = new IndexWriter(
+    		FSDirectory.open(new File(directory, "index-sorted")),
+    			null, true, MaxFieldLength.UNLIMITED);
     writer.setTermIndexInterval
       (termIndexInterval);
     writer.setUseCompoundFile(false);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java Fri Mar 19 11:34:33 2010
@@ -18,6 +18,7 @@ package org.apache.nutch.indexer.field;
 
 import java.io.DataInput;
 import java.io.DataOutput;
+import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -57,6 +58,8 @@ import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.FSDirectory;
 import org.apache.nutch.analysis.AnalyzerFactory;
 import org.apache.nutch.analysis.NutchAnalyzer;
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
@@ -115,8 +118,10 @@ public class FieldIndexer
 
       final AnalyzerFactory factory = new AnalyzerFactory(job);
       final IndexWriter writer = // build locally first
-      new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
-        new NutchDocumentAnalyzer(job), true);
+      new IndexWriter(
+        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
+        new NutchDocumentAnalyzer(job), true, 
+        new MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
 
       writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
       writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java Fri Mar 19 11:34:33 2010
@@ -35,10 +35,13 @@ public interface LuceneConstants {
 
   public static final String INDEX_NO = "index.no";
 
+  // TODO: -> ANALYZED_NO_NORMS
   public static final String INDEX_NO_NORMS = "index.no_norms";
 
+  // TODO: -> ANALYZED
   public static final String INDEX_TOKENIZED = "index.tokenized";
 
+  // TODO: -> NOT_ANALYZED
   public static final String INDEX_UNTOKENIZED = "index.untokenized";
 
   public static final String VECTOR_NO = "vector.no";

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java Fri Mar 19 11:34:33 2010
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.indexer.lucene;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -32,6 +33,8 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.FSDirectory;
 import org.apache.nutch.analysis.AnalyzerFactory;
 import org.apache.nutch.analysis.NutchAnalyzer;
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
@@ -108,13 +111,13 @@ public class LuceneWriter implements Nut
           } else if (LuceneConstants.STORE_NO.equals(val)) {
             store = Field.Store.NO;
           } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) {
-            index = Field.Index.TOKENIZED;
+            index = Field.Index.ANALYZED;
           } else if (LuceneConstants.INDEX_NO.equals(val)) {
             index = Field.Index.NO;
           } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) {
-            index = Field.Index.UN_TOKENIZED;
+            index = Field.Index.NOT_ANALYZED;
           } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) {
-            index = Field.Index.NO_NORMS;
+            index = Field.Index.ANALYZED_NO_NORMS;
           } else if (LuceneConstants.VECTOR_NO.equals(val)) {
             vector = Field.TermVector.NO;
           } else if (LuceneConstants.VECTOR_YES.equals(val)) {
@@ -151,14 +154,12 @@ public class LuceneWriter implements Nut
         final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key));
         switch (store) {
         case YES:
+        case COMPRESS:
           fieldStore.put(field, Field.Store.YES);
           break;
         case NO:
           fieldStore.put(field, Field.Store.NO);
           break;
-        case COMPRESS:
-          fieldStore.put(field, Field.Store.COMPRESS);
-          break;
         }
       } else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) {
         final String field =
@@ -169,13 +170,13 @@ public class LuceneWriter implements Nut
           fieldIndex.put(field, Field.Index.NO);
           break;
         case NO_NORMS:
-          fieldIndex.put(field, Field.Index.NO_NORMS);
+          fieldIndex.put(field, Field.Index.NOT_ANALYZED_NO_NORMS);
           break;
         case TOKENIZED:
-          fieldIndex.put(field, Field.Index.TOKENIZED);
+          fieldIndex.put(field, Field.Index.ANALYZED);
           break;
         case UNTOKENIZED:
-          fieldIndex.put(field, Field.Index.UN_TOKENIZED);
+          fieldIndex.put(field, Field.Index.NOT_ANALYZED);
           break;
         }
       } else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) {
@@ -212,8 +213,9 @@ public class LuceneWriter implements Nut
 
     fs.delete(perm, true); // delete old, if any
     analyzerFactory = new AnalyzerFactory(job);
-    writer = new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
-        new NutchDocumentAnalyzer(job), true);
+    writer = new IndexWriter(
+        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
+        new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);
 
     writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
     writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
@@ -266,8 +268,6 @@ public class LuceneWriter implements Nut
     final Metadata documentMeta = doc.getDocumentMeta();
     if (f.isStored()) {
       documentMeta.add(key, LuceneConstants.STORE_YES);
-    } else if (f.isCompressed()) {
-      documentMeta.add(key, LuceneConstants.STORE_COMPRESS);
     } else {
       documentMeta.add(key, LuceneConstants.STORE_NO);
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Fri Mar 19 11:34:33 2010
@@ -107,4 +107,4 @@ public class MetaWrapper extends NutchWr
     super.write(out);
     metadata.write(out);
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Fri Mar 19 11:34:33 2010
@@ -120,4 +120,4 @@ public class LoopReader {
     }
   }
 
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Fri Mar 19 11:34:33 2010
@@ -603,4 +603,4 @@ public class Loops
       return -2;
     }
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Fri Mar 19 11:34:33 2010
@@ -119,4 +119,4 @@ public class NodeReader {
     }
   }
 
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Fri Mar 19 11:34:33 2010
@@ -105,4 +105,4 @@ public class DistributedSearch {
       server.join();
     }
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Fri Mar 19 11:34:33 2010
@@ -30,6 +30,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.search.FieldCache;
@@ -83,7 +84,7 @@ public class IndexSearcher implements Se
     if ("file".equals(this.fs.getUri().getScheme())) {
       Path qualified = file.makeQualified(FileSystem.getLocal(conf));
       File fsLocal = new File(qualified.toUri());
-      return FSDirectory.getDirectory(fsLocal.getAbsolutePath());
+      return FSDirectory.open(new File(fsLocal.getAbsolutePath()));
     } else {
       return new FsDirectory(this.fs, file, false, this.conf);
     }
@@ -120,11 +121,11 @@ public class IndexSearcher implements Se
 
     Document doc = luceneSearcher.doc(Integer.valueOf(hit.getUniqueKey()));
 
-    List docFields = doc.getFields();
+    List<Fieldable> docFields = doc.getFields();
     String[] fields = new String[docFields.size()];
     String[] values = new String[docFields.size()];
     for (int i = 0; i < docFields.size(); i++) {
-      Field field = (Field)docFields.get(i);
+      Fieldable field = docFields.get(i);
       fields[i] = field.name();
       values[i] = field.stringValue();
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Fri Mar 19 11:34:33 2010
@@ -17,18 +17,14 @@
 
 package org.apache.nutch.searcher;
 
-import org.apache.lucene.search.Searcher;
-import org.apache.lucene.search.*;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.misc.ChainedFilter;
+import java.io.IOException;
+import java.util.*;
 
 import org.apache.hadoop.conf.Configuration;
-
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.ArrayList;
-
-import java.io.IOException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.misc.ChainedFilter;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.Searcher;
 
 /** Utility which converts certain query clauses into {@link QueryFilter}s and
  * caches these.  Only required clauses whose boost is zero are converted to
@@ -93,16 +89,18 @@ class LuceneQueryOptimizer {
     }
   }
 
-  private static class LimitedCollector extends TopDocCollector {
+  private static class LimitedCollector extends Collector {
     private int maxHits;
     private int maxTicks;
     private int startTicks;
     private TimerThread timer;
     private int curTicks;
+    private TopDocsCollector<ScoreDoc> delegate;
 
     public LimitedCollector(int numHits, int maxHits, int maxTicks,
             TimerThread timer) {
-      super(numHits);
+      final boolean docsScoredInOrder = true;
+      delegate = TopScoreDocCollector.create(numHits, docsScoredInOrder);
       this.maxHits = maxHits;
       this.maxTicks = maxTicks;
       if (timer != null) {
@@ -111,8 +109,14 @@ class LuceneQueryOptimizer {
       }
     }
 
-    public void collect(int doc, float score) {
-      if (maxHits > 0 && getTotalHits() >= maxHits) {
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return delegate.acceptsDocsOutOfOrder();
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      if (maxHits > 0 && delegate.getTotalHits() >= maxHits) {
         throw new LimitExceeded(doc);
       }
       if (timer != null) {
@@ -123,7 +127,22 @@ class LuceneQueryOptimizer {
           throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc);
         }
       }
-      super.collect(doc, score);
+      delegate.collect(doc);
+    }
+
+    @Override
+    public void setNextReader(IndexReader r, int base)
+        throws IOException {
+      delegate.setNextReader(r, base);
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      delegate.setScorer(scorer);
+    }
+
+    public TopDocs topDocs() {
+      return delegate.topDocs();
     }
   }
   
@@ -193,15 +212,11 @@ public LuceneQueryOptimizer(Configuratio
           continue;
         }
           
-        if (c.getQuery() instanceof RangeQuery) { // RangeQuery
-          RangeQuery range = (RangeQuery)c.getQuery();
-          boolean inclusive = range.isInclusive();// convert to RangeFilter
-          Term lower = range.getLowerTerm();
-          Term upper = range.getUpperTerm();
-          filters.add(new RangeFilter(lower!=null?lower.field():upper.field(),
-                                      lower != null ? lower.text() : null,
-                                      upper != null ? upper.text() : null,
-                                      inclusive, inclusive));
+        if (c.getQuery() instanceof TermRangeQuery) { // RangeQuery
+          TermRangeQuery range = (TermRangeQuery)c.getQuery();
+          filters.add(new TermRangeFilter(range.getField(), 
+              range.getLowerTerm(), range.getUpperTerm(), 
+              range.includesLower(), range.includesUpper()));
           cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
           continue;
         }
@@ -271,7 +286,7 @@ public LuceneQueryOptimizer(Configuratio
 
     } else {
       return searcher.search(query, filter, numHits,
-                             new Sort(sortField, reverse));
+                             new Sort(new SortField(sortField, SortField.STRING, reverse)));
     }
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Fri Mar 19 11:34:33 2010
@@ -378,7 +378,7 @@ HitInlinks, Closeable {
 
   /** For debugging. */
   public static void main(String[] args) throws Exception {
-    final String usage = "NutchBean query";
+    final String usage = "NutchBean query [<searcher.dir>]";
 
     if (args.length == 0) {
       System.err.println(usage);
@@ -386,6 +386,9 @@ HitInlinks, Closeable {
     }
 
     final Configuration conf = NutchConfiguration.create();
+    if (args.length > 1) {
+      conf.set("searcher.dir", args[1]);
+    }
     final NutchBean bean = new NutchBean(conf);
     try {
       final Query query = Query.parse(args[0], conf);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java Fri Mar 19 11:34:33 2010
@@ -103,4 +103,4 @@ public class SegmentPart {
     String part = string.substring(idx + 1);
     return new SegmentPart(segment, part);
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java Fri Mar 19 11:34:33 2010
@@ -22,38 +22,21 @@
 
 package org.apache.nutch.tools;
 
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileFilter;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.util.BitSet;
-import java.util.StringTokenizer;
-import java.util.Vector;
+import java.io.*;
+import java.util.*;
 
-// Commons Logging imports
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
 import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.*;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.HitCollector;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
+import org.apache.lucene.search.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+import org.apache.nutch.util.NutchConfiguration;
 
 /**
  * This tool prunes existing Nutch indexes of unwanted content. The main method
@@ -253,13 +236,13 @@ public class PruneIndexTool implements R
     if (dryrun) dr = "[DRY RUN] ";
     int numIdx = 0;
     if (indexDirs.length == 1) {
-      Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
-      if (IndexReader.isLocked(dir)) {
+      Directory dir = FSDirectory.open(indexDirs[0]);
+      if (IndexWriter.isLocked(dir)) {
         if (!unlock) {
           throw new Exception("Index " + indexDirs[0] + " is locked.");
         }
         if (!dryrun) {
-          IndexReader.unlock(dir);
+          IndexWriter.unlock(dir);
           if (LOG.isDebugEnabled()) {
             LOG.debug(" - had to unlock index in " + dir);
           }
@@ -272,8 +255,8 @@ public class PruneIndexTool implements R
       Vector<IndexReader> indexes = new Vector<IndexReader>(indexDirs.length);
       for (int i = 0; i < indexDirs.length; i++) {
         try {
-          dir = FSDirectory.getDirectory(indexDirs[i], false);
-          if (IndexReader.isLocked(dir)) {
+          dir = FSDirectory.open(indexDirs[i]);
+          if (IndexWriter.isLocked(dir)) {
             if (!unlock) {
               if (LOG.isWarnEnabled()) {
                 LOG.warn(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
@@ -281,7 +264,7 @@ public class PruneIndexTool implements R
               continue;
             }
             if (!dryrun) {
-              IndexReader.unlock(dir);
+              IndexWriter.unlock(dir);
               if (LOG.isDebugEnabled()) {
                 LOG.debug(" - had to unlock index in " + dir);
               }
@@ -315,15 +298,31 @@ public class PruneIndexTool implements R
    * 
    * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
    */
-  private static class AllHitsCollector extends HitCollector {
+  private static class AllHitsCollector extends Collector {
     private BitSet bits;
     
     public AllHitsCollector(BitSet bits) {
       this.bits = bits;
     }
-    public void collect(int doc, float score) {
+
+    public void collect(int doc) {
       bits.set(doc);
     }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return false;
+    }
+
+    @Override
+    public void setNextReader(IndexReader paramIndexReader, int paramInt) throws IOException {
+      // Do nothing.
+    }
+
+    @Override
+    public void setScorer(Scorer paramScorer) throws IOException {
+      // Do nothing.
+    }
   }
   
   /**
@@ -415,7 +414,7 @@ public class PruneIndexTool implements R
       return;
     }
     Vector<File> paths = new Vector<File>();
-    if (IndexReader.indexExists(idx)) {
+    if (IndexReader.indexExists(FSDirectory.open(idx))) {
       paths.add(idx);
     } else {
       // try and see if there are segments inside, with index dirs
@@ -431,7 +430,8 @@ public class PruneIndexTool implements R
       }
       for (int i = 0; i < dirs.length; i++) {
         File sidx = new File(dirs[i], "index");
-        if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
+        if (sidx.exists() && sidx.isDirectory() 
+            && IndexReader.indexExists(FSDirectory.open(sidx))) {
           paths.add(sidx);
         }
       }
@@ -534,7 +534,7 @@ public class PruneIndexTool implements R
   public static Query[] parseQueries(InputStream is) throws Exception {
     BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
     String line = null;
-    QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer());
+    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "url", new WhitespaceAnalyzer());
     Vector<Query> queries = new Vector<Query>();
     while ((line = br.readLine()) != null) {
       line = line.trim();

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java Fri Mar 19 11:34:33 2010
@@ -309,4 +309,4 @@ public class ReprUrlFixer
       return -1;
     }
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java (original)
+++ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java Fri Mar 19 11:34:33 2010
@@ -23,6 +23,7 @@ import java.io.Reader;
 // Lucene imports
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
 
 // Nutch imports
 import org.apache.nutch.analysis.NutchAnalyzer;
@@ -35,7 +36,7 @@ import org.apache.nutch.analysis.NutchAn
 public class GermanAnalyzer extends NutchAnalyzer {
     
     private final static Analyzer ANALYZER = 
-            new org.apache.lucene.analysis.de.GermanAnalyzer();
+            new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_CURRENT);
 
     
     /** Creates a new instance of FrenchAnalyzer */

Modified: lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java Fri Mar 19 11:34:33 2010
@@ -23,6 +23,7 @@ import java.io.Reader;
 // Lucene imports
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
 
 // Nutch imports
 import org.apache.nutch.analysis.NutchAnalyzer;
@@ -35,7 +36,7 @@ import org.apache.nutch.analysis.NutchAn
 public class FrenchAnalyzer extends NutchAnalyzer {
     
     private final static Analyzer ANALYZER = 
-            new org.apache.lucene.analysis.fr.FrenchAnalyzer();
+            new org.apache.lucene.analysis.fr.FrenchAnalyzer(Version.LUCENE_CURRENT);
 
     
     /** Creates a new instance of FrenchAnalyzer */