You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/03/19 12:34:34 UTC
svn commit: r925179 [1/2] - in /lucene/nutch/trunk: ./ lib/
src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/indexer/field/
src/java/org/apache/nutch/indexer/lucene/
src/java/org/apache/nutch/metadata/ sr...
Author: ab
Date: Fri Mar 19 11:34:33 2010
New Revision: 925179
URL: http://svn.apache.org/viewvc?rev=925179&view=rev
Log:
NUTCH-787 Upgrade to Lucene 3.0.1.
Added:
lucene/nutch/trunk/lib/lucene-core-3.0.1.jar (with props)
lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar (with props)
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar (with props)
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar (with props)
Removed:
lucene/nutch/trunk/lib/lucene-core-2.9.1.jar
lucene/nutch/trunk/lib/lucene-misc-2.9.1.jar
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.9.1.jar
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.9.1.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Mar 19 11:34:33 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Unreleased Changes
+* NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab)
+
* NUTCH-796 Zero results problems difficult to troubleshoot due to lack of logging (ab)
* NUTCH-801 Remove RTF and MP3 parse plugins (jnioche)
Added: lucene/nutch/trunk/lib/lucene-core-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/lucene-core-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/lucene-misc-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Fri Mar 19 11:34:33 2010
@@ -17,23 +17,21 @@
package org.apache.nutch.analysis;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-
import java.io.*;
import java.util.*;
-// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.conf.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.nutch.searcher.Query.Phrase;
+import org.apache.nutch.searcher.Query.Term;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
-import org.apache.nutch.searcher.Query.*;
-/** Construct n-grams for frequently occuring terms and phrases while indexing.
+/** Construct n-grams for frequently occurring terms and phrases while indexing.
* Optimize phrase queries to use the n-grams. Single terms are still indexed
* too, with n-grams overlaid. This is achieved through the use of {@link
* Token#setPositionIncrement(int)}.*/
@@ -61,10 +59,44 @@ public class CommonGrams {
private LinkedList<Token> nextQueue = new LinkedList<Token>();
private StringBuffer buffer = new StringBuffer();
+ private final TermAttribute termAtt;
+ private final PositionIncrementAttribute posIncrAtt;
+ private final TypeAttribute typeAtt;
+ private final OffsetAttribute offsetAtt;
+
/** Construct an n-gram producing filter. */
public Filter(TokenStream input, HashSet<String> common) {
super(input);
this.common = common;
+ this.termAtt = getAttribute(TermAttribute.class);
+ this.offsetAtt = getAttribute(OffsetAttribute.class);
+ this.posIncrAtt = getAttribute(PositionIncrementAttribute.class);
+ this.typeAtt = addAttribute(TypeAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+ Token t = next();
+ if (t != null) {
+ termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+ offsetAtt.setOffset(t.startOffset(), t.endOffset());
+ posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+ typeAtt.setType(t.type());
+ }
+ return t != null;
+ }
+
+ private Token inputNext() throws IOException {
+ if (super.input.incrementToken()) {
+ Token t = new Token(
+ termAtt.termBuffer(), 0, termAtt.termLength(),
+ offsetAtt.startOffset(), offsetAtt.endOffset());
+ t.setPositionIncrement(posIncrAtt.getPositionIncrement());
+ t.setType(typeAtt.type());
+ return t;
+ }
+ return null;
}
/** Inserts n-grams into a token stream. */
@@ -103,7 +135,7 @@ public class CommonGrams {
/** True iff token is for a common term. */
private boolean isCommon(Token token) {
- return common != null && common.contains(token.termText());
+ return common != null && common.contains(token.term());
}
/** Pops nextQueue or, if empty, reads a new token. */
@@ -111,13 +143,13 @@ public class CommonGrams {
if (nextQueue.size() > 0)
return nextQueue.removeFirst();
else
- return input.next();
+ return inputNext();
}
/** Return next token in nextQueue, extending it when empty. */
private Token peekNext(ListIterator<Token> i) throws IOException {
if (!i.hasNext()) {
- Token next = input.next();
+ Token next = inputNext();
if (next == null)
return null;
i.add(next);
@@ -129,9 +161,9 @@ public class CommonGrams {
/** Construct a compound token. */
private Token gramToken(Token first, Token second) {
buffer.setLength(0);
- buffer.append(first.termText());
+ buffer.append(first.term());
buffer.append(SEPARATOR);
- buffer.append(second.termText());
+ buffer.append(second.term());
Token result = new Token(buffer.toString(),
first.startOffset(), second.endOffset(),
"gram");
@@ -159,24 +191,23 @@ public class CommonGrams {
if (line.startsWith("#") || "".equals(line)) // skip comments
continue;
TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
- Token token = ts.next();
- if (token == null) {
+ TermAttribute ta = ts.getAttribute(TermAttribute.class);
+ if (!ts.incrementToken()) {
if (LOG.isWarnEnabled()) {
LOG.warn("Line does not contain a field name: " + line);
}
continue;
}
- String field = token.termText();
- token = ts.next();
- if (token == null) {
+ String field = ta.term();
+ if (!ts.incrementToken()) {
if (LOG.isWarnEnabled()) {
LOG.warn("Line contains only a field name, no word: " + line);
}
continue;
}
- String gram = token.termText();
- while ((token = ts.next()) != null) {
- gram = gram + SEPARATOR + token.termText();
+ String gram = ta.term();
+ while (ts.incrementToken()) {
+ gram = gram + SEPARATOR + ta.term();
}
HashSet<String> table = commonTerms.get(field);
if (table == null) {
@@ -201,16 +232,27 @@ public class CommonGrams {
private static class ArrayTokens extends TokenStream {
private Term[] terms;
private int index;
+ private final TermAttribute termAttr;
+ private final PositionIncrementAttribute posAttr;
+ private final OffsetAttribute offsetAttr;
public ArrayTokens(Phrase phrase) {
this.terms = phrase.getTerms();
+ this.termAttr = addAttribute(TermAttribute.class);
+ this.posAttr = addAttribute(PositionIncrementAttribute.class);
+ this.offsetAttr = addAttribute(OffsetAttribute.class);
}
- public Token next() {
+ @Override
+ public boolean incrementToken() throws IOException {
if (index == terms.length)
- return null;
- else
- return new Token(terms[index].toString(), index, ++index);
+ return false;
+
+ clearAttributes();
+ termAttr.setTermBuffer(terms[index].toString());
+ posAttr.setPositionIncrement(1);
+ offsetAttr.setOffset(index, ++index);
+ return true;
}
}
@@ -222,22 +264,24 @@ public class CommonGrams {
}
ArrayList<String> result = new ArrayList<String>();
TokenStream ts = getFilter(new ArrayTokens(phrase), field);
- Token token, prev=null;
+ String prev = null;
+ TermAttribute ta = ts.getAttribute(TermAttribute.class);
+ PositionIncrementAttribute pa = ts.getAttribute(PositionIncrementAttribute.class);
int position = 0;
try {
- while ((token = ts.next()) != null) {
- if (token.getPositionIncrement() != 0 && prev != null)
- result.add(prev.termText());
- prev = token;
- position += token.getPositionIncrement();
- if ((position + arity(token.termText())) == phrase.getTerms().length)
+ while (ts.incrementToken()) {
+ if (pa.getPositionIncrement() != 0 && prev != null)
+ result.add(prev);
+ prev = ta.term();
+ position += pa.getPositionIncrement();
+ if ((position + arity(ta.term())) == phrase.getTerms().length)
break;
}
} catch (IOException e) {
throw new RuntimeException(e.toString());
}
if (prev != null)
- result.add(prev.termText());
+ result.add(prev);
return result.toArray(new String[result.size()]);
}
@@ -261,9 +305,12 @@ public class CommonGrams {
TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
ts = commonGrams.getFilter(ts, "url");
- Token token;
- while ((token = ts.next()) != null) {
- System.out.println("Token: " + token);
+ TermAttribute ta = ts.getAttribute(TermAttribute.class);
+ OffsetAttribute oa = ts.getAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute pia = ts.getAttribute(PositionIncrementAttribute.class);
+ while (ts.incrementToken()) {
+ System.out.println("Token: " + ta.term() + " offs:" + oa.startOffset() + "-" + oa.endOffset()
+ + " incr: " + pia.getPositionIncrement());
}
String[] optimized = commonGrams.optimizePhrase(new Phrase(args), "url");
System.out.print("Optimized: ");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java Fri Mar 19 11:34:33 2010
@@ -10,6 +10,7 @@ import org.apache.nutch.util.NutchConfig
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.*;
import java.util.*;
@@ -72,7 +73,7 @@ public class NutchAnalysis implements Nu
/** Parse a query. */
final public Query parse(Configuration conf) throws ParseException {
Query query = new Query(conf);
- ArrayList<String> terms;
+ ArrayList terms;
Token token;
String field;
boolean stop;
@@ -140,7 +141,7 @@ public class NutchAnalysis implements Nu
throw new ParseException();
}
nonOpOrTerm();
- String[] array = terms.toArray(new String[terms.size()]);
+ String[] array = (String[])terms.toArray(new String[terms.size()]);
if (stop
&& field == Clause.DEFAULT_FIELD
@@ -160,10 +161,10 @@ public class NutchAnalysis implements Nu
/** Parse an explcitly quoted phrase query. Note that this may return a single
* term, a trivial phrase.*/
- final public ArrayList<String> phrase(String field) throws ParseException {
+ final public ArrayList phrase(String field) throws ParseException {
int start;
int end;
- ArrayList<String> result = new ArrayList<String>();
+ ArrayList result = new ArrayList();
String term;
jj_consume_token(QUOTE);
start = token.endColumn;
@@ -243,10 +244,10 @@ public class NutchAnalysis implements Nu
/** Parse a compound term that is interpreted as an implicit phrase query.
* Compounds are a sequence of terms separated by infix characters. Note that
- * htis may return a single term, a trivial compound. */
- final public ArrayList<String> compound(String field) throws ParseException {
+ * this may return a single term, a trivial compound. */
+ final public ArrayList compound(String field) throws ParseException {
int start;
- ArrayList<String> result = new ArrayList<String>();
+ ArrayList result = new ArrayList();
String term;
StringBuffer terms = new StringBuffer();
start = token.endColumn;
@@ -289,19 +290,23 @@ public class NutchAnalysis implements Nu
result.add(queryString.substring(start, token.endColumn));
} else {
- org.apache.lucene.analysis.Token token;
TokenStream tokens = analyzer.tokenStream(
field, new StringReader(terms.toString()));
- while (true) {
- try {
- token = tokens.next();
- } catch (IOException e) {
- token = null;
+ TermAttribute ta = tokens.getAttribute(TermAttribute.class);
+ try
+ {
+ String termText;
+ while (tokens.incrementToken())
+ {
+ if ((termText = ta.term()) == null)
+ break;
+ result.add(termText);
}
- if (token == null) { break; }
- result.add(token.termText());
+ } catch (IOException e) {
+ // ignore (?)
}
+//
try {
tokens.close();
} catch (IOException e) {
@@ -470,76 +475,77 @@ public class NutchAnalysis implements Nu
}
}
- final private boolean jj_2_1(int xla) {
+ private boolean jj_2_1(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_1(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(0, xla); }
}
- final private boolean jj_2_2(int xla) {
+ private boolean jj_2_2(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_2(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(1, xla); }
}
- final private boolean jj_2_3(int xla) {
+ private boolean jj_2_3(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_3(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(2, xla); }
}
- final private boolean jj_3_1() {
- if (jj_scan_token(WORD)) return true;
- if (jj_scan_token(COLON)) return true;
+ private boolean jj_3_3() {
Token xsp;
xsp = jj_scanpos;
- if (jj_3R_8()) {
+ if (jj_scan_token(15)) {
jj_scanpos = xsp;
- if (jj_3R_9()) return true;
+ if (jj_3R_12()) {
+ jj_scanpos = xsp;
+ if (jj_3R_13()) return true;
}
+ }
+ return false;
+ }
+
+ private boolean jj_3R_27() {
+ if (jj_3R_16()) return true;
return false;
}
- final private boolean jj_3R_16() {
+ private boolean jj_3R_25() {
+ if (jj_3R_24()) return true;
+ return false;
+ }
+
+ private boolean jj_3R_23() {
+ if (jj_3R_24()) return true;
+ return false;
+ }
+
+ private boolean jj_3R_18() {
Token xsp;
xsp = jj_scanpos;
- if (jj_scan_token(7)) {
- jj_scanpos = xsp;
- if (jj_scan_token(8)) {
+ if (jj_3R_23()) {
jj_scanpos = xsp;
- if (jj_3R_22()) return true;
- }
+ if (jj_scan_token(0)) return true;
}
return false;
}
- final private boolean jj_3_3() {
+ private boolean jj_3R_13() {
Token xsp;
xsp = jj_scanpos;
- if (jj_scan_token(15)) {
- jj_scanpos = xsp;
- if (jj_3R_12()) {
+ if (jj_scan_token(7)) {
jj_scanpos = xsp;
- if (jj_3R_13()) return true;
- }
+ if (jj_scan_token(8)) return true;
}
+ if (jj_3R_18()) return true;
return false;
}
- final private boolean jj_3R_25() {
- if (jj_3R_24()) return true;
- return false;
- }
-
- final private boolean jj_3R_27() {
- if (jj_3R_16()) return true;
- return false;
- }
-
- final private boolean jj_3R_20() {
+ private boolean jj_3R_20() {
if (jj_3R_11()) return true;
Token xsp;
while (true) {
@@ -549,17 +555,17 @@ public class NutchAnalysis implements Nu
return false;
}
- final private boolean jj_3R_10() {
+ private boolean jj_3R_10() {
if (jj_3R_16()) return true;
return false;
}
- final private boolean jj_3R_19() {
+ private boolean jj_3R_19() {
if (jj_3R_24()) return true;
return false;
}
- final private boolean jj_3_2() {
+ private boolean jj_3_2() {
Token xsp;
if (jj_3R_10()) return true;
while (true) {
@@ -570,38 +576,22 @@ public class NutchAnalysis implements Nu
return false;
}
- final private boolean jj_3R_23() {
- if (jj_3R_24()) return true;
- return false;
- }
-
- final private boolean jj_3R_18() {
- Token xsp;
- xsp = jj_scanpos;
- if (jj_3R_23()) {
- jj_scanpos = xsp;
- if (jj_scan_token(0)) return true;
- }
+ private boolean jj_3R_9() {
+ if (jj_3R_15()) return true;
return false;
}
- final private boolean jj_3R_13() {
+ private boolean jj_3R_24() {
Token xsp;
xsp = jj_scanpos;
- if (jj_scan_token(7)) {
+ if (jj_scan_token(15)) {
jj_scanpos = xsp;
- if (jj_scan_token(8)) return true;
+ if (jj_3R_27()) return true;
}
- if (jj_3R_18()) return true;
- return false;
- }
-
- final private boolean jj_3R_9() {
- if (jj_3R_15()) return true;
return false;
}
- final private boolean jj_3R_14() {
+ private boolean jj_3R_14() {
if (jj_scan_token(QUOTE)) return true;
Token xsp;
while (true) {
@@ -620,22 +610,17 @@ public class NutchAnalysis implements Nu
return false;
}
- final private boolean jj_3R_24() {
- Token xsp;
- xsp = jj_scanpos;
- if (jj_scan_token(15)) {
- jj_scanpos = xsp;
- if (jj_3R_27()) return true;
- }
+ private boolean jj_3R_26() {
+ if (jj_3R_16()) return true;
return false;
}
- final private boolean jj_3R_26() {
- if (jj_3R_16()) return true;
+ private boolean jj_3R_22() {
+ if (jj_3R_17()) return true;
return false;
}
- final private boolean jj_3R_21() {
+ private boolean jj_3R_21() {
Token xsp;
if (jj_3R_26()) return true;
while (true) {
@@ -646,22 +631,12 @@ public class NutchAnalysis implements Nu
return false;
}
- final private boolean jj_3R_22() {
- if (jj_3R_17()) return true;
- return false;
- }
-
- final private boolean jj_3R_8() {
- if (jj_3R_14()) return true;
- return false;
- }
-
- final private boolean jj_3R_12() {
+ private boolean jj_3R_12() {
if (jj_3R_17()) return true;
return false;
}
- final private boolean jj_3R_11() {
+ private boolean jj_3R_11() {
Token xsp;
xsp = jj_scanpos;
if (jj_scan_token(1)) {
@@ -674,7 +649,12 @@ public class NutchAnalysis implements Nu
return false;
}
- final private boolean jj_3R_15() {
+ private boolean jj_3R_8() {
+ if (jj_3R_14()) return true;
+ return false;
+ }
+
+ private boolean jj_3R_15() {
if (jj_3R_11()) return true;
Token xsp;
while (true) {
@@ -684,7 +664,7 @@ public class NutchAnalysis implements Nu
return false;
}
- final private boolean jj_3R_17() {
+ private boolean jj_3R_17() {
Token xsp;
xsp = jj_scanpos;
if (jj_scan_token(10)) {
@@ -703,25 +683,54 @@ public class NutchAnalysis implements Nu
return false;
}
+ private boolean jj_3_1() {
+ if (jj_scan_token(WORD)) return true;
+ if (jj_scan_token(COLON)) return true;
+ Token xsp;
+ xsp = jj_scanpos;
+ if (jj_3R_8()) {
+ jj_scanpos = xsp;
+ if (jj_3R_9()) return true;
+ }
+ return false;
+ }
+
+ private boolean jj_3R_16() {
+ Token xsp;
+ xsp = jj_scanpos;
+ if (jj_scan_token(7)) {
+ jj_scanpos = xsp;
+ if (jj_scan_token(8)) {
+ jj_scanpos = xsp;
+ if (jj_3R_22()) return true;
+ }
+ }
+ return false;
+ }
+
+ /** Generated Token Manager. */
public NutchAnalysisTokenManager token_source;
- public Token token, jj_nt;
+ /** Current token. */
+ public Token token;
+ /** Next token. */
+ public Token jj_nt;
private int jj_ntk;
private Token jj_scanpos, jj_lastpos;
private int jj_la;
- public boolean lookingAhead = false;
private int jj_gen;
final private int[] jj_la1 = new int[16];
static private int[] jj_la1_0;
static {
- jj_la1_0();
+ jj_la1_init_0();
}
- private static void jj_la1_0() {
+ private static void jj_la1_init_0() {
jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
}
final private JJCalls[] jj_2_rtns = new JJCalls[3];
private boolean jj_rescan = false;
private int jj_gc = 0;
+ /** Constructor with user supplied CharStream. */
public NutchAnalysis(CharStream stream) {
token_source = new NutchAnalysisTokenManager(stream);
token = new Token();
@@ -731,6 +740,7 @@ public class NutchAnalysis implements Nu
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
+ /** Reinitialise. */
public void ReInit(CharStream stream) {
token_source.ReInit(stream);
token = new Token();
@@ -740,6 +750,7 @@ public class NutchAnalysis implements Nu
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
+ /** Constructor with generated Token Manager. */
public NutchAnalysis(NutchAnalysisTokenManager tm) {
token_source = tm;
token = new Token();
@@ -749,6 +760,7 @@ public class NutchAnalysis implements Nu
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
+ /** Reinitialise. */
public void ReInit(NutchAnalysisTokenManager tm) {
token_source = tm;
token = new Token();
@@ -758,7 +770,7 @@ public class NutchAnalysis implements Nu
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
- final private Token jj_consume_token(int kind) throws ParseException {
+ private Token jj_consume_token(int kind) throws ParseException {
Token oldToken;
if ((oldToken = token).next != null) token = token.next;
else token = token.next = token_source.getNextToken();
@@ -782,10 +794,9 @@ public class NutchAnalysis implements Nu
throw generateParseException();
}
- @SuppressWarnings("serial")
static private final class LookaheadSuccess extends java.lang.Error { }
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
- final private boolean jj_scan_token(int kind) {
+ private boolean jj_scan_token(int kind) {
if (jj_scanpos == jj_lastpos) {
jj_la--;
if (jj_scanpos.next == null) {
@@ -806,6 +817,8 @@ public class NutchAnalysis implements Nu
return false;
}
+
+/** Get the next Token. */
final public Token getNextToken() {
if (token.next != null) token = token.next;
else token = token.next = token_source.getNextToken();
@@ -814,8 +827,9 @@ public class NutchAnalysis implements Nu
return token;
}
+/** Get the specific Token. */
final public Token getToken(int index) {
- Token t = lookingAhead ? jj_scanpos : token;
+ Token t = token;
for (int i = 0; i < index; i++) {
if (t.next != null) t = t.next;
else t = t.next = token_source.getNextToken();
@@ -823,14 +837,14 @@ public class NutchAnalysis implements Nu
return t;
}
- final private int jj_ntk() {
+ private int jj_ntk() {
if ((jj_nt=token.next) == null)
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
else
return (jj_ntk = jj_nt.kind);
}
- private java.util.Vector<int[]> jj_expentries = new java.util.Vector<int[]>();
+ private java.util.List jj_expentries = new java.util.ArrayList();
private int[] jj_expentry;
private int jj_kind = -1;
private int[] jj_lasttokens = new int[100];
@@ -845,31 +859,26 @@ public class NutchAnalysis implements Nu
for (int i = 0; i < jj_endpos; i++) {
jj_expentry[i] = jj_lasttokens[i];
}
- boolean exists = false;
- for (java.util.Enumeration<int[]> e = jj_expentries.elements(); e.hasMoreElements();) {
- int[] oldentry = (e.nextElement());
+ jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
+ int[] oldentry = (int[])(it.next());
if (oldentry.length == jj_expentry.length) {
- exists = true;
for (int i = 0; i < jj_expentry.length; i++) {
if (oldentry[i] != jj_expentry[i]) {
- exists = false;
- break;
+ continue jj_entries_loop;
}
}
- if (exists) break;
+ jj_expentries.add(jj_expentry);
+ break jj_entries_loop;
}
}
- if (!exists) jj_expentries.addElement(jj_expentry);
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
}
}
+ /** Generate ParseException. */
public ParseException generateParseException() {
- jj_expentries.removeAllElements();
+ jj_expentries.clear();
boolean[] la1tokens = new boolean[20];
- for (int i = 0; i < 20; i++) {
- la1tokens[i] = false;
- }
if (jj_kind >= 0) {
la1tokens[jj_kind] = true;
jj_kind = -1;
@@ -887,7 +896,7 @@ public class NutchAnalysis implements Nu
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;
- jj_expentries.addElement(jj_expentry);
+ jj_expentries.add(jj_expentry);
}
}
jj_endpos = 0;
@@ -895,18 +904,20 @@ public class NutchAnalysis implements Nu
jj_add_error_token(0, 0);
int[][] exptokseq = new int[jj_expentries.size()][];
for (int i = 0; i < jj_expentries.size(); i++) {
- exptokseq[i] = jj_expentries.elementAt(i);
+ exptokseq[i] = (int[])jj_expentries.get(i);
}
return new ParseException(token, exptokseq, tokenImage);
}
+ /** Enable tracing. */
final public void enable_tracing() {
}
+ /** Disable tracing. */
final public void disable_tracing() {
}
- final private void jj_rescan_token() {
+ private void jj_rescan_token() {
jj_rescan = true;
for (int i = 0; i < 3; i++) {
try {
@@ -927,7 +938,7 @@ public class NutchAnalysis implements Nu
jj_rescan = false;
}
- final private void jj_save(int index, int xla) {
+ private void jj_save(int index, int xla) {
JJCalls p = jj_2_rtns[index];
while (p.gen > jj_gen) {
if (p.next == null) { p = p.next = new JJCalls(); break; }
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj Fri Mar 19 11:34:33 2010
@@ -38,6 +38,7 @@ import org.apache.nutch.util.NutchConfig
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.*;
import java.util.*;
@@ -277,7 +278,7 @@ ArrayList phrase(String field) :
/** Parse a compound term that is interpreted as an implicit phrase query.
* Compounds are a sequence of terms separated by infix characters. Note that
- * htis may return a single term, a trivial compound. */
+ * this may return a single term, a trivial compound. */
ArrayList compound(String field) :
{
int start;
@@ -305,19 +306,23 @@ ArrayList compound(String field) :
result.add(queryString.substring(start, token.endColumn));
} else {
- org.apache.lucene.analysis.Token token;
TokenStream tokens = analyzer.tokenStream(
field, new StringReader(terms.toString()));
- while (true) {
- try {
- token = tokens.next();
- } catch (IOException e) {
- token = null;
+ TermAttribute ta = tokens.getAttribute(TermAttribute.class);
+ try
+ {
+ String termText;
+ while (tokens.incrementToken())
+ {
+ if ((termText = ta.term()) == null)
+ break;
+ result.add(termText);
}
- if (token == null) { break; }
- result.add(token.termText());
+ } catch (IOException e) {
+ // ignore (?)
}
+//
try {
tokens.close();
} catch (IOException e) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java Fri Mar 19 11:34:33 2010
@@ -1,31 +1,58 @@
/* Generated By:JavaCC: Do not edit this line. NutchAnalysisConstants.java */
package org.apache.nutch.analysis;
+
+/**
+ * Token literal values and constants.
+ * Generated by org.javacc.parser.OtherFilesGen#start()
+ */
public interface NutchAnalysisConstants {
+ /** End of File. */
int EOF = 0;
+ /** RegularExpression Id. */
int WORD = 1;
+ /** RegularExpression Id. */
int ACRONYM = 2;
+ /** RegularExpression Id. */
int SIGRAM = 3;
+ /** RegularExpression Id. */
int IRREGULAR_WORD = 4;
+ /** RegularExpression Id. */
int C_PLUS_PLUS = 5;
+ /** RegularExpression Id. */
int C_SHARP = 6;
+ /** RegularExpression Id. */
int PLUS = 7;
+ /** RegularExpression Id. */
int MINUS = 8;
+ /** RegularExpression Id. */
int QUOTE = 9;
+ /** RegularExpression Id. */
int COLON = 10;
+ /** RegularExpression Id. */
int SLASH = 11;
+ /** RegularExpression Id. */
int DOT = 12;
+ /** RegularExpression Id. */
int ATSIGN = 13;
+ /** RegularExpression Id. */
int APOSTROPHE = 14;
+ /** RegularExpression Id. */
int WHITE = 15;
+ /** RegularExpression Id. */
int WORD_PUNCT = 16;
+ /** RegularExpression Id. */
int LETTER = 17;
+ /** RegularExpression Id. */
int CJK = 18;
+ /** RegularExpression Id. */
int DIGIT = 19;
+ /** Lexical state. */
int DEFAULT = 0;
+ /** Literal token values. */
String[] tokenImage = {
"<EOF>",
"<WORD>",
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java Fri Mar 19 11:34:33 2010
@@ -1,23 +1,38 @@
/* Generated By:JavaCC: Do not edit this line. NutchAnalysisTokenManager.java */
package org.apache.nutch.analysis;
-
import java.io.*;
+/** Token Manager. */
public class NutchAnalysisTokenManager implements NutchAnalysisConstants
{
/** Constructs a token manager for the provided Reader. */
public NutchAnalysisTokenManager(Reader reader) {
this(new FastCharStream(reader));
}
+
+ /** Debug output. */
public java.io.PrintStream debugStream = System.out;
+ /** Set debug output. */
public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
-private final int jjStopAtPos(int pos, int kind)
+private final int jjStopStringLiteralDfa_0(int pos, long active0)
+{
+ switch (pos)
+ {
+ default :
+ return -1;
+ }
+}
+private final int jjStartNfa_0(int pos, long active0)
+{
+ return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0), pos + 1);
+}
+private int jjStopAtPos(int pos, int kind)
{
jjmatchedKind = kind;
jjmatchedPos = pos;
return pos + 1;
}
-private final int jjMoveStringLiteralDfa0_0()
+private int jjMoveStringLiteralDfa0_0()
{
switch(curChar)
{
@@ -41,20 +56,6 @@ private final int jjMoveStringLiteralDfa
return jjMoveNfa_0(1, 0);
}
}
-private final void jjCheckNAdd(int state)
-{
- if (jjrounds[state] != jjround)
- {
- jjstateSet[jjnewStateCnt++] = state;
- jjrounds[state] = jjround;
- }
-}
-private final void jjAddStates(int start, int end)
-{
- do {
- jjstateSet[jjnewStateCnt++] = jjnextStates[start];
- } while (start++ != end);
-}
static final long[] jjbitVec0 = {
0xfffffffeL, 0x0L, 0x0L, 0x0L
};
@@ -76,7 +77,7 @@ static final long[] jjbitVec6 = {
static final long[] jjbitVec7 = {
0x3fffffffffffL, 0x0L, 0x0L, 0x0L
};
-private final int jjMoveNfa_0(int startState, int curPos)
+private int jjMoveNfa_0(int startState, int curPos)
{
int startsAt = 0;
jjnewStateCnt = 10;
@@ -257,26 +258,36 @@ private static final boolean jjCanMove_1
return false;
}
}
+
+/** Token literal values. */
public static final String[] jjstrLiteralImages = {
"", null, null, null, null, null, null, "\53", "\55", "\42", "\72", "\57",
"\56", "\100", "\47", null, null, null, null, null, };
+
+/** Lexer state names. */
public static final String[] lexStateNames = {
"DEFAULT",
};
protected CharStream input_stream;
private final int[] jjrounds = new int[10];
private final int[] jjstateSet = new int[20];
-StringBuffer image;
-int jjimageLen;
-int lengthOfMatch;
+private final StringBuffer jjimage = new StringBuffer();
+private StringBuffer image = jjimage;
+private int jjimageLen;
+private int lengthOfMatch;
protected char curChar;
+/** Constructor. */
public NutchAnalysisTokenManager(CharStream stream){
input_stream = stream;
}
+
+/** Constructor. */
public NutchAnalysisTokenManager(CharStream stream, int lexState){
this(stream);
SwitchTo(lexState);
}
+
+/** Reinitialise parser. */
public void ReInit(CharStream stream)
{
jjmatchedPos = jjnewStateCnt = 0;
@@ -284,18 +295,22 @@ public void ReInit(CharStream stream)
input_stream = stream;
ReInitRounds();
}
-private final void ReInitRounds()
+private void ReInitRounds()
{
int i;
jjround = 0x80000001;
for (i = 10; i-- > 0;)
jjrounds[i] = 0x80000000;
}
+
+/** Reinitialise parser. */
public void ReInit(CharStream stream, int lexState)
{
ReInit(stream);
SwitchTo(lexState);
}
+
+/** Switch to specified lex state. */
public void SwitchTo(int lexState)
{
if (lexState >= 1 || lexState < 0)
@@ -306,14 +321,25 @@ public void SwitchTo(int lexState)
protected Token jjFillToken()
{
- Token t = Token.newToken(jjmatchedKind);
- t.kind = jjmatchedKind;
+ final Token t;
+ final String curTokenImage;
+ final int beginLine;
+ final int endLine;
+ final int beginColumn;
+ final int endColumn;
String im = jjstrLiteralImages[jjmatchedKind];
- t.image = (im == null) ? input_stream.GetImage() : im;
- t.beginLine = input_stream.getBeginLine();
- t.beginColumn = input_stream.getBeginColumn();
- t.endLine = input_stream.getEndLine();
- t.endColumn = input_stream.getEndColumn();
+ curTokenImage = (im == null) ? input_stream.GetImage() : im;
+ beginLine = input_stream.getBeginLine();
+ beginColumn = input_stream.getBeginColumn();
+ endLine = input_stream.getEndLine();
+ endColumn = input_stream.getEndColumn();
+ t = Token.newToken(jjmatchedKind, curTokenImage);
+
+ t.beginLine = beginLine;
+ t.endLine = endLine;
+ t.beginColumn = beginColumn;
+ t.endColumn = endColumn;
+
return t;
}
@@ -324,11 +350,13 @@ int jjround;
int jjmatchedPos;
int jjmatchedKind;
+/** Get the next Token. */
public Token getNextToken()
{
Token matchedToken;
int curPos = 0;
+ EOFLoop :
for (;;)
{
try
@@ -341,7 +369,8 @@ public Token getNextToken()
matchedToken = jjFillToken();
return matchedToken;
}
- image = null;
+ image = jjimage;
+ image.setLength(0);
jjimageLen = 0;
jjmatchedKind = 0x7fffffff;
@@ -387,15 +416,11 @@ void TokenLexicalActions(Token matchedTo
switch(jjmatchedKind)
{
case 1 :
- if (image == null)
- image = new StringBuffer();
- image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
+ image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
matchedToken.image = matchedToken.image.toLowerCase();
break;
case 2 :
- if (image == null)
- image = new StringBuffer();
- image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
+ image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
// remove dots
for (int i = 0; i < image.length(); i++) {
if (image.charAt(i) == '.')
@@ -407,4 +432,24 @@ void TokenLexicalActions(Token matchedTo
break;
}
}
+private void jjCheckNAdd(int state)
+{
+ if (jjrounds[state] != jjround)
+ {
+ jjstateSet[jjnewStateCnt++] = state;
+ jjrounds[state] = jjround;
+ }
+}
+private void jjAddStates(int start, int end)
+{
+ do {
+ jjstateSet[jjnewStateCnt++] = jjnextStates[start];
+ } while (start++ != end);
+}
+private void jjCheckNAddTwoStates(int state1, int state2)
+{
+ jjCheckNAdd(state1);
+ jjCheckNAdd(state2);
+}
+
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Fri Mar 19 11:34:33 2010
@@ -24,7 +24,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.hadoop.conf.Configuration;
/**
@@ -71,21 +71,24 @@ public class NutchDocumentAnalyzer exten
}
private static class AnchorFilter extends TokenFilter {
+ private final PositionIncrementAttribute posAttr;
private boolean first = true;
public AnchorFilter(TokenStream input) {
super(input);
+ // The super filter must have positional information.
+ posAttr = input.getAttribute(PositionIncrementAttribute.class);
}
- public final Token next() throws IOException {
- Token result = input.next();
- if (result == null)
- return result;
- if (first) {
- result.setPositionIncrement(INTER_ANCHOR_GAP);
- first = false;
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ if (hasNext) {
+ if (first) {
+ posAttr.setPositionIncrement(INTER_ANCHOR_GAP);
+ first = false;
+ }
}
- return result;
+ return hasNext;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java Fri Mar 19 11:34:33 2010
@@ -19,8 +19,9 @@ package org.apache.nutch.analysis;
import java.io.*;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
/** The tokenizer used for Nutch document text. Implemented in terms of our
* JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared
@@ -28,18 +29,27 @@ import org.apache.lucene.analysis.Token;
*/
public final class NutchDocumentTokenizer extends Tokenizer
implements NutchAnalysisConstants {
-
- private NutchAnalysisTokenManager tokenManager;
+
+ private final NutchAnalysisTokenManager tokenManager;
+
+ private final TermAttribute termAtt;
+ private final PositionIncrementAttribute posIncrAtt;
+ private final TypeAttribute typeAtt;
+ private final OffsetAttribute offsetAtt;
/** Construct a tokenizer for the text in a Reader. */
public NutchDocumentTokenizer(Reader reader) {
super(reader);
+
tokenManager = new NutchAnalysisTokenManager(reader);
+ this.termAtt = addAttribute(TermAttribute.class);
+ this.offsetAtt = addAttribute(OffsetAttribute.class);
+ this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ this.typeAtt = addAttribute(TypeAttribute.class);
}
-
- /** Returns the next token in the stream, or null at EOF. */
- public final Token next() throws IOException {
+ /** Returns the next token in the stream, or null at EOF. */
+ private final Token next() throws IOException {
org.apache.nutch.analysis.Token t;
try {
@@ -64,6 +74,23 @@ public final class NutchDocumentTokenize
}
}
+ /** Lucene 3.0 API. */
+ public boolean incrementToken() throws IOException
+ {
+ clearAttributes();
+
+ final Token t = next();
+ if (t != null) {
+ termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+ offsetAtt.setOffset(t.startOffset(), t.endOffset());
+ posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+ typeAtt.setType(t.type());
+ return true;
+ } else {
+ return false;
+ }
+ }
+
/** For debugging. */
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
@@ -71,14 +98,13 @@ public final class NutchDocumentTokenize
System.out.print("Text: ");
String line = in.readLine();
Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
- Token token;
+ TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
System.out.print("Tokens: ");
- while ((token = tokenizer.next()) != null) {
- System.out.print(token.termText());
+ while (tokenizer.incrementToken()) {
+ System.out.print(termAtt.term());
System.out.print(" ");
}
System.out.println();
}
}
-
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/ParseException.java Fri Mar 19 11:34:33 2010
@@ -1,4 +1,4 @@
-/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 3.0 */
+/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
package org.apache.nutch.analysis;
/**
@@ -11,7 +11,7 @@ package org.apache.nutch.analysis;
* mechanisms so long as you retain the public fields.
*/
@SuppressWarnings("serial")
-class ParseException extends java.io.IOException {
+public class ParseException extends java.io.IOException {
/**
* This constructor is used by the method "generateParseException"
@@ -52,6 +52,7 @@ class ParseException extends java.io.IOE
specialConstructor = false;
}
+ /** Constructor with message. */
public ParseException(String message) {
super(message);
specialConstructor = false;
@@ -99,19 +100,19 @@ class ParseException extends java.io.IOE
if (!specialConstructor) {
return super.getMessage();
}
- String expected = "";
+ StringBuffer expected = new StringBuffer();
int maxSize = 0;
for (int i = 0; i < expectedTokenSequences.length; i++) {
if (maxSize < expectedTokenSequences[i].length) {
maxSize = expectedTokenSequences[i].length;
}
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
- expected += tokenImage[expectedTokenSequences[i][j]] + " ";
+ expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
}
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
- expected += "...";
+ expected.append("...");
}
- expected += eol + " ";
+ expected.append(eol).append(" ");
}
String retval = "Encountered \"";
Token tok = currentToken.next;
@@ -121,7 +122,10 @@ class ParseException extends java.io.IOE
retval += tokenImage[0];
break;
}
+ retval += " " + tokenImage[tok.kind];
+ retval += " \"";
retval += add_escapes(tok.image);
+ retval += " \"";
tok = tok.next;
}
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
@@ -131,7 +135,7 @@ class ParseException extends java.io.IOE
} else {
retval += "Was expecting one of:" + eol + " ";
}
- retval += expected;
+ retval += expected.toString();
return retval;
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/Token.java Fri Mar 19 11:34:33 2010
@@ -1,4 +1,4 @@
-/* Generated By:JavaCC: Do not edit this line. Token.java Version 3.0 */
+/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
package org.apache.nutch.analysis;
/**
@@ -14,12 +14,14 @@ class Token {
*/
public int kind;
- /**
- * beginLine and beginColumn describe the position of the first character
- * of this token; endLine and endColumn describe the position of the
- * last character of this token.
- */
- public int beginLine, beginColumn, endLine, endColumn;
+ /** The line number of the first character of this Token. */
+ public int beginLine;
+ /** The column number of the first character of this Token. */
+ public int beginColumn;
+ /** The line number of the last character of this Token. */
+ public int endLine;
+ /** The column number of the last character of this Token. */
+ public int endColumn;
/**
* The string image of the token.
@@ -51,6 +53,40 @@ class Token {
public Token specialToken;
/**
+ * An optional attribute value of the Token.
+ * Tokens which are not used as syntactic sugar will often contain
+ * meaningful values that will be used later on by the compiler or
+ * interpreter. This attribute value is often different from the image.
+ * Any subclass of Token that actually wants to return a non-null value can
+ * override this method as appropriate.
+ */
+ public Object getValue() {
+ return null;
+ }
+
+ /**
+ * No-argument constructor
+ */
+ public Token() {}
+
+ /**
+ * Constructs a new token for the specified Image.
+ */
+ public Token(int kind)
+ {
+ this(kind, null);
+ }
+
+ /**
+ * Constructs a new token for the specified Image and Kind.
+ */
+ public Token(int kind, String image)
+ {
+ this.kind = kind;
+ this.image = image;
+ }
+
+ /**
* Returns the image.
*/
public String toString()
@@ -63,19 +99,25 @@ class Token {
* can create and return subclass objects based on the value of ofKind.
* Simply add the cases to the switch for all those special cases.
* For example, if you have a subclass of Token called IDToken that
- * you want to create if ofKind is ID, simlpy add something like :
+ * you want to create if ofKind is ID, simply add something like :
*
- * case MyParserConstants.ID : return new IDToken();
+ * case MyParserConstants.ID : return new IDToken(ofKind, image);
*
* to the following switch statement. Then you can cast matchedToken
- * variable to the appropriate type and use it in your lexical actions.
+ * variable to the appropriate type and use it in your lexical actions.
*/
- public static final Token newToken(int ofKind)
+ public static Token newToken(int ofKind, String image)
{
switch(ofKind)
{
- default : return new Token();
+ default : return new Token(ofKind, image);
}
}
+ public static Token newToken(int ofKind)
+ {
+ return newToken(ofKind, null);
+ }
+
}
+/* JavaCC - OriginalChecksum=6925860b4b6a41d42c759eab47d0d3a3 (do not edit this line) */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Fri Mar 19 11:34:33 2010
@@ -379,7 +379,7 @@ public class DeleteDuplicates extends Co
OutputCollector<WritableComparable, Writable> output, Reporter reporter)
throws IOException {
Path index = new Path(key.toString());
- IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
+ IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()), false);
try {
while (values.hasNext()) {
IntWritable value = values.next();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java Fri Mar 19 11:34:33 2010
@@ -64,7 +64,7 @@ public class FsDirectory extends Directo
}
}
- public String[] list() throws IOException {
+ public String[] listAll() throws IOException {
FileStatus[] fstats = fs.listStatus(directory, HadoopFSUtil.getPassAllFilter());
Path[] files = HadoopFSUtil.getPaths(fstats);
if (files == null) return null;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java Fri Mar 19 11:34:33 2010
@@ -17,11 +17,13 @@
package org.apache.nutch.indexer;
+import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
+import java.io.File;
import java.io.OutputStreamWriter;
/** Lists the most frequent terms in an index. */
@@ -37,14 +39,12 @@ public class HighFreqTerms {
Term term;
}
- private static class TermFreqQueue extends PriorityQueue {
+ private static class TermFreqQueue extends PriorityQueue<TermFreq> {
TermFreqQueue(int size) {
initialize(size);
}
- protected final boolean lessThan(Object a, Object b) {
- TermFreq termInfoA = (TermFreq)a;
- TermFreq termInfoB = (TermFreq)b;
+ protected final boolean lessThan(TermFreq termInfoA, TermFreq termInfoB) {
return termInfoA.docFreq < termInfoB.docFreq;
}
}
@@ -66,7 +66,7 @@ public class HighFreqTerms {
} else if (args[i].equals("-nofreqs")) { // found -nofreqs option
noFreqs = true;
} else {
- reader = IndexReader.open(args[i]);
+ reader = IndexReader.open(FSDirectory.open(new File(args[i])));
}
}
@@ -76,10 +76,10 @@ public class HighFreqTerms {
int minFreq = 0;
while (terms.next()) {
if (terms.docFreq() > minFreq) {
- tiq.put(new TermFreq(terms.term(), terms.docFreq()));
+ TermFreq top = tiq.add(new TermFreq(terms.term(), terms.docFreq()));
if (tiq.size() >= count) { // if tiq overfull
tiq.pop(); // remove lowest in tiq
- minFreq = ((TermFreq)tiq.top()).docFreq; // reset minFreq
+ minFreq = top.docFreq; // reset minFreq
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Fri Mar 19 11:34:33 2010
@@ -33,7 +33,10 @@ import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
/*************************************************************************
* IndexMerger creates an index for the output corresponding to a
@@ -86,15 +89,18 @@ public class IndexMerger extends Configu
//
// Merge indices
//
- IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
- writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+ IndexWriter writer = new IndexWriter(
+ FSDirectory.open(new File(localOutput.toString())), null, true,
+ MaxFieldLength.UNLIMITED);
+ writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR));
writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
- writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+ writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS));
writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
writer.setInfoStream(LogUtil.getDebugStream(LOG));
writer.setUseCompoundFile(false);
writer.setSimilarity(new NutchSimilarity());
- writer.addIndexes(dirs);
+ writer.addIndexesNoOptimize(dirs);
+ writer.optimize();
writer.close();
//
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Fri Mar 19 11:34:33 2010
@@ -23,6 +23,7 @@ import java.util.Date;
import java.util.Arrays;
import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.document.*;
import org.apache.lucene.store.*;
import org.apache.lucene.search.*;
@@ -188,7 +189,7 @@ public class IndexSorter extends Configu
}
public Document document(int n) throws IOException {
- return super.document(newToOld[n]);
+ return document(n, null);
}
public Document document(int n, FieldSelector fieldSelector)
@@ -263,11 +264,13 @@ public class IndexSorter extends Configu
LOG.info("IndexSorter: starting.");
Date start = new Date();
int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
- IndexReader reader = IndexReader.open(new File(directory, "index"));
+ IndexReader reader = IndexReader.open(
+ FSDirectory.open(new File(directory, "index")));
SortingReader sorter = new SortingReader(reader, oldToNew(reader));
- IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"),
- null, true);
+ IndexWriter writer = new IndexWriter(
+ FSDirectory.open(new File(directory, "index-sorted")),
+ null, true, MaxFieldLength.UNLIMITED);
writer.setTermIndexInterval
(termIndexInterval);
writer.setUseCompoundFile(false);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/field/FieldIndexer.java Fri Mar 19 11:34:33 2010
@@ -18,6 +18,7 @@ package org.apache.nutch.indexer.field;
import java.io.DataInput;
import java.io.DataOutput;
+import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -57,6 +58,8 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.FSDirectory;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.analysis.NutchAnalyzer;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
@@ -115,8 +118,10 @@ public class FieldIndexer
final AnalyzerFactory factory = new AnalyzerFactory(job);
final IndexWriter writer = // build locally first
- new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
- new NutchDocumentAnalyzer(job), true);
+ new IndexWriter(
+ FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
+ new NutchDocumentAnalyzer(job), true,
+ new MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java Fri Mar 19 11:34:33 2010
@@ -35,10 +35,13 @@ public interface LuceneConstants {
public static final String INDEX_NO = "index.no";
+ // TODO: -> ANALYZED_NO_NORMS
public static final String INDEX_NO_NORMS = "index.no_norms";
+ // TODO: -> ANALYZED
public static final String INDEX_TOKENIZED = "index.tokenized";
+ // TODO: -> NOT_ANALYZED
public static final String INDEX_UNTOKENIZED = "index.untokenized";
public static final String VECTOR_NO = "vector.no";
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java Fri Mar 19 11:34:33 2010
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.indexer.lucene;
+import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
@@ -32,6 +33,8 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.FSDirectory;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.analysis.NutchAnalyzer;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
@@ -108,13 +111,13 @@ public class LuceneWriter implements Nut
} else if (LuceneConstants.STORE_NO.equals(val)) {
store = Field.Store.NO;
} else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) {
- index = Field.Index.TOKENIZED;
+ index = Field.Index.ANALYZED;
} else if (LuceneConstants.INDEX_NO.equals(val)) {
index = Field.Index.NO;
} else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) {
- index = Field.Index.UN_TOKENIZED;
+ index = Field.Index.NOT_ANALYZED;
} else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) {
- index = Field.Index.NO_NORMS;
+ index = Field.Index.ANALYZED_NO_NORMS;
} else if (LuceneConstants.VECTOR_NO.equals(val)) {
vector = Field.TermVector.NO;
} else if (LuceneConstants.VECTOR_YES.equals(val)) {
@@ -151,14 +154,12 @@ public class LuceneWriter implements Nut
final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key));
switch (store) {
case YES:
+ case COMPRESS:
fieldStore.put(field, Field.Store.YES);
break;
case NO:
fieldStore.put(field, Field.Store.NO);
break;
- case COMPRESS:
- fieldStore.put(field, Field.Store.COMPRESS);
- break;
}
} else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) {
final String field =
@@ -169,13 +170,13 @@ public class LuceneWriter implements Nut
fieldIndex.put(field, Field.Index.NO);
break;
case NO_NORMS:
- fieldIndex.put(field, Field.Index.NO_NORMS);
+ fieldIndex.put(field, Field.Index.NOT_ANALYZED_NO_NORMS);
break;
case TOKENIZED:
- fieldIndex.put(field, Field.Index.TOKENIZED);
+ fieldIndex.put(field, Field.Index.ANALYZED);
break;
case UNTOKENIZED:
- fieldIndex.put(field, Field.Index.UN_TOKENIZED);
+ fieldIndex.put(field, Field.Index.NOT_ANALYZED);
break;
}
} else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) {
@@ -212,8 +213,9 @@ public class LuceneWriter implements Nut
fs.delete(perm, true); // delete old, if any
analyzerFactory = new AnalyzerFactory(job);
- writer = new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
- new NutchDocumentAnalyzer(job), true);
+ writer = new IndexWriter(
+ FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
+ new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);
writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
@@ -266,8 +268,6 @@ public class LuceneWriter implements Nut
final Metadata documentMeta = doc.getDocumentMeta();
if (f.isStored()) {
documentMeta.add(key, LuceneConstants.STORE_YES);
- } else if (f.isCompressed()) {
- documentMeta.add(key, LuceneConstants.STORE_COMPRESS);
} else {
documentMeta.add(key, LuceneConstants.STORE_NO);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Fri Mar 19 11:34:33 2010
@@ -107,4 +107,4 @@ public class MetaWrapper extends NutchWr
super.write(out);
metadata.write(out);
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Fri Mar 19 11:34:33 2010
@@ -120,4 +120,4 @@ public class LoopReader {
}
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Fri Mar 19 11:34:33 2010
@@ -603,4 +603,4 @@ public class Loops
return -2;
}
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Fri Mar 19 11:34:33 2010
@@ -119,4 +119,4 @@ public class NodeReader {
}
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Fri Mar 19 11:34:33 2010
@@ -105,4 +105,4 @@ public class DistributedSearch {
server.join();
}
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Fri Mar 19 11:34:33 2010
@@ -30,6 +30,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.FieldCache;
@@ -83,7 +84,7 @@ public class IndexSearcher implements Se
if ("file".equals(this.fs.getUri().getScheme())) {
Path qualified = file.makeQualified(FileSystem.getLocal(conf));
File fsLocal = new File(qualified.toUri());
- return FSDirectory.getDirectory(fsLocal.getAbsolutePath());
+ return FSDirectory.open(new File(fsLocal.getAbsolutePath()));
} else {
return new FsDirectory(this.fs, file, false, this.conf);
}
@@ -120,11 +121,11 @@ public class IndexSearcher implements Se
Document doc = luceneSearcher.doc(Integer.valueOf(hit.getUniqueKey()));
- List docFields = doc.getFields();
+ List<Fieldable> docFields = doc.getFields();
String[] fields = new String[docFields.size()];
String[] values = new String[docFields.size()];
for (int i = 0; i < docFields.size(); i++) {
- Field field = (Field)docFields.get(i);
+ Fieldable field = docFields.get(i);
fields[i] = field.name();
values[i] = field.stringValue();
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Fri Mar 19 11:34:33 2010
@@ -17,18 +17,14 @@
package org.apache.nutch.searcher;
-import org.apache.lucene.search.Searcher;
-import org.apache.lucene.search.*;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.misc.ChainedFilter;
+import java.io.IOException;
+import java.util.*;
import org.apache.hadoop.conf.Configuration;
-
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.ArrayList;
-
-import java.io.IOException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.misc.ChainedFilter;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.Searcher;
/** Utility which converts certain query clauses into {@link QueryFilter}s and
* caches these. Only required clauses whose boost is zero are converted to
@@ -93,16 +89,18 @@ class LuceneQueryOptimizer {
}
}
- private static class LimitedCollector extends TopDocCollector {
+ private static class LimitedCollector extends Collector {
private int maxHits;
private int maxTicks;
private int startTicks;
private TimerThread timer;
private int curTicks;
+ private TopDocsCollector<ScoreDoc> delegate;
public LimitedCollector(int numHits, int maxHits, int maxTicks,
TimerThread timer) {
- super(numHits);
+ final boolean docsScoredInOrder = true;
+ delegate = TopScoreDocCollector.create(numHits, docsScoredInOrder);
this.maxHits = maxHits;
this.maxTicks = maxTicks;
if (timer != null) {
@@ -111,8 +109,14 @@ class LuceneQueryOptimizer {
}
}
- public void collect(int doc, float score) {
- if (maxHits > 0 && getTotalHits() >= maxHits) {
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return delegate.acceptsDocsOutOfOrder();
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+ if (maxHits > 0 && delegate.getTotalHits() >= maxHits) {
throw new LimitExceeded(doc);
}
if (timer != null) {
@@ -123,7 +127,22 @@ class LuceneQueryOptimizer {
throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc);
}
}
- super.collect(doc, score);
+ delegate.collect(doc);
+ }
+
+ @Override
+ public void setNextReader(IndexReader r, int base)
+ throws IOException {
+ delegate.setNextReader(r, base);
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ delegate.setScorer(scorer);
+ }
+
+ public TopDocs topDocs() {
+ return delegate.topDocs();
}
}
@@ -193,15 +212,11 @@ public LuceneQueryOptimizer(Configuratio
continue;
}
- if (c.getQuery() instanceof RangeQuery) { // RangeQuery
- RangeQuery range = (RangeQuery)c.getQuery();
- boolean inclusive = range.isInclusive();// convert to RangeFilter
- Term lower = range.getLowerTerm();
- Term upper = range.getUpperTerm();
- filters.add(new RangeFilter(lower!=null?lower.field():upper.field(),
- lower != null ? lower.text() : null,
- upper != null ? upper.text() : null,
- inclusive, inclusive));
+ if (c.getQuery() instanceof TermRangeQuery) { // RangeQuery
+ TermRangeQuery range = (TermRangeQuery)c.getQuery();
+ filters.add(new TermRangeFilter(range.getField(),
+ range.getLowerTerm(), range.getUpperTerm(),
+ range.includesLower(), range.includesUpper()));
cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
continue;
}
@@ -271,7 +286,7 @@ public LuceneQueryOptimizer(Configuratio
} else {
return searcher.search(query, filter, numHits,
- new Sort(sortField, reverse));
+ new Sort(new SortField(sortField, SortField.STRING, reverse)));
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Fri Mar 19 11:34:33 2010
@@ -378,7 +378,7 @@ HitInlinks, Closeable {
/** For debugging. */
public static void main(String[] args) throws Exception {
- final String usage = "NutchBean query";
+ final String usage = "NutchBean query [<searcher.dir>]";
if (args.length == 0) {
System.err.println(usage);
@@ -386,6 +386,9 @@ HitInlinks, Closeable {
}
final Configuration conf = NutchConfiguration.create();
+ if (args.length > 1) {
+ conf.set("searcher.dir", args[1]);
+ }
final NutchBean bean = new NutchBean(conf);
try {
final Query query = Query.parse(args[0], conf);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java Fri Mar 19 11:34:33 2010
@@ -103,4 +103,4 @@ public class SegmentPart {
String part = string.substring(idx + 1);
return new SegmentPart(segment, part);
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java Fri Mar 19 11:34:33 2010
@@ -22,38 +22,21 @@
package org.apache.nutch.tools;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileFilter;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.util.BitSet;
-import java.util.StringTokenizer;
-import java.util.Vector;
+import java.io.*;
+import java.util.*;
-// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.HitCollector;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+import org.apache.nutch.util.NutchConfiguration;
/**
* This tool prunes existing Nutch indexes of unwanted content. The main method
@@ -253,13 +236,13 @@ public class PruneIndexTool implements R
if (dryrun) dr = "[DRY RUN] ";
int numIdx = 0;
if (indexDirs.length == 1) {
- Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
- if (IndexReader.isLocked(dir)) {
+ Directory dir = FSDirectory.open(indexDirs[0]);
+ if (IndexWriter.isLocked(dir)) {
if (!unlock) {
throw new Exception("Index " + indexDirs[0] + " is locked.");
}
if (!dryrun) {
- IndexReader.unlock(dir);
+ IndexWriter.unlock(dir);
if (LOG.isDebugEnabled()) {
LOG.debug(" - had to unlock index in " + dir);
}
@@ -272,8 +255,8 @@ public class PruneIndexTool implements R
Vector<IndexReader> indexes = new Vector<IndexReader>(indexDirs.length);
for (int i = 0; i < indexDirs.length; i++) {
try {
- dir = FSDirectory.getDirectory(indexDirs[i], false);
- if (IndexReader.isLocked(dir)) {
+ dir = FSDirectory.open(indexDirs[i]);
+ if (IndexWriter.isLocked(dir)) {
if (!unlock) {
if (LOG.isWarnEnabled()) {
LOG.warn(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
@@ -281,7 +264,7 @@ public class PruneIndexTool implements R
continue;
}
if (!dryrun) {
- IndexReader.unlock(dir);
+ IndexWriter.unlock(dir);
if (LOG.isDebugEnabled()) {
LOG.debug(" - had to unlock index in " + dir);
}
@@ -315,15 +298,31 @@ public class PruneIndexTool implements R
*
* @author Andrzej Bialecki <ab@getopt.org>
*/
- private static class AllHitsCollector extends HitCollector {
+ private static class AllHitsCollector extends Collector {
private BitSet bits;
public AllHitsCollector(BitSet bits) {
this.bits = bits;
}
- public void collect(int doc, float score) {
+
+ public void collect(int doc) {
bits.set(doc);
}
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+
+ @Override
+ public void setNextReader(IndexReader paramIndexReader, int paramInt) throws IOException {
+ // Do nothing.
+ }
+
+ @Override
+ public void setScorer(Scorer paramScorer) throws IOException {
+ // Do nothing.
+ }
}
/**
@@ -415,7 +414,7 @@ public class PruneIndexTool implements R
return;
}
Vector<File> paths = new Vector<File>();
- if (IndexReader.indexExists(idx)) {
+ if (IndexReader.indexExists(FSDirectory.open(idx))) {
paths.add(idx);
} else {
// try and see if there are segments inside, with index dirs
@@ -431,7 +430,8 @@ public class PruneIndexTool implements R
}
for (int i = 0; i < dirs.length; i++) {
File sidx = new File(dirs[i], "index");
- if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
+ if (sidx.exists() && sidx.isDirectory()
+ && IndexReader.indexExists(FSDirectory.open(sidx))) {
paths.add(sidx);
}
}
@@ -534,7 +534,7 @@ public class PruneIndexTool implements R
public static Query[] parseQueries(InputStream is) throws Exception {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String line = null;
- QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer());
+ QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "url", new WhitespaceAnalyzer());
Vector<Query> queries = new Vector<Query>();
while ((line = br.readLine()) != null) {
line = line.trim();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java Fri Mar 19 11:34:33 2010
@@ -309,4 +309,4 @@ public class ReprUrlFixer
return -1;
}
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java (original)
+++ lucene/nutch/trunk/src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java Fri Mar 19 11:34:33 2010
@@ -23,6 +23,7 @@ import java.io.Reader;
// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
// Nutch imports
import org.apache.nutch.analysis.NutchAnalyzer;
@@ -35,7 +36,7 @@ import org.apache.nutch.analysis.NutchAn
public class GermanAnalyzer extends NutchAnalyzer {
private final static Analyzer ANALYZER =
- new org.apache.lucene.analysis.de.GermanAnalyzer();
+ new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_CURRENT);
/** Creates a new instance of FrenchAnalyzer */
Modified: lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/nutch/trunk/src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java Fri Mar 19 11:34:33 2010
@@ -23,6 +23,7 @@ import java.io.Reader;
// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
// Nutch imports
import org.apache.nutch.analysis.NutchAnalyzer;
@@ -35,7 +36,7 @@ import org.apache.nutch.analysis.NutchAn
public class FrenchAnalyzer extends NutchAnalyzer {
private final static Analyzer ANALYZER =
- new org.apache.lucene.analysis.fr.FrenchAnalyzer();
+ new org.apache.lucene.analysis.fr.FrenchAnalyzer(Version.LUCENE_CURRENT);
/** Creates a new instance of FrenchAnalyzer */