You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/03/19 12:34:34 UTC
svn commit: r925179 [2/2] - in /lucene/nutch/trunk: ./ lib/
src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/indexer/field/
src/java/org/apache/nutch/indexer/lucene/
src/java/org/apache/nutch/metadata/ sr...
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java Fri Mar 19 11:34:33 2010
@@ -20,6 +20,7 @@ package org.creativecommons.nutch;
import org.apache.nutch.indexer.Indexer;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Document;
import org.apache.commons.logging.Log;
@@ -83,7 +84,7 @@ public class CCDeleteUnlicensedTool {
File indexDone = new File(directories[i], Indexer.DONE_NAME);
if (indexDone.exists() && indexDone.isFile()){
File indexDir = new File(directories[i], "index");
- IndexReader reader = IndexReader.open(indexDir);
+ IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
maxDoc += reader.maxDoc();
vReaders.add(reader);
}
Modified: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Fri Mar 19 11:34:33 2010
@@ -80,8 +80,10 @@ public class BasicFieldFilter
// create lucene fields from the FieldWritable objects
Field.Store store = field.isStored() ? Field.Store.YES
: Field.Store.NO;
- Field.Index indexed = field.isIndexed() ? field.isTokenized()
- ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED : Field.Index.NO;
+ Field.Index indexed =
+ field.isIndexed()
+ ? field.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED
+ : Field.Index.NO;
Field docField = new Field(fieldName, field.getValue(), store,
indexed);
Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Fri Mar 19 11:34:33 2010
@@ -25,11 +25,11 @@
<plugin
id="lib-lucene-analyzers"
name="Lucene Analysers"
- version="2.9.1"
+ version="3.0.1"
provider-name="org.apache.lucene">
<runtime>
- <library name="lucene-analyzers-2.9.1.jar">
+ <library name="lucene-analyzers-3.0.1.jar">
<export name="*"/>
</library>
</runtime>
Modified: lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java Fri Mar 19 11:34:33 2010
@@ -29,8 +29,7 @@ import org.apache.commons.logging.LogFac
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.RangeQuery;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.search.TermRangeQuery;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
@@ -74,11 +73,12 @@ public class DateQueryFilter implements
}
// do it as lucene RangeQuery
- Term xLower = new Term(FIELD_NAME, matcher.group(1));
- Term xUpper = new Term(FIELD_NAME, matcher.group(2));
+ String xLower = matcher.group(1);
+ String xUpper = matcher.group(2);
// inclusive
- RangeQuery rangeQuery = new RangeQuery(xLower, xUpper, true);
+ TermRangeQuery rangeQuery = new TermRangeQuery(
+ c.getField(), xLower, xUpper, true, true);
rangeQuery.setBoost(0.0f); // trigger filterization
Modified: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java (original)
+++ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java Fri Mar 19 11:34:33 2010
@@ -39,6 +39,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.*;
// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
@@ -152,7 +153,7 @@ public class BasicSummarizer implements
//
// If we find a term that's in the query...
//
- if (highlight.contains(tokens[i].termText())) {
+ if (highlight.contains(tokens[i].term())) {
//
// Start searching at a point SUM_CONTEXT terms back,
// and move SUM_CONTEXT terms into the future.
@@ -182,8 +183,8 @@ public class BasicSummarizer implements
// Now grab the hit-element, if present
//
Token t = tokens[j];
- if (highlight.contains(t.termText())) {
- excerpt.addToken(t.termText());
+ if (highlight.contains(t.term())) {
+ excerpt.addToken(t.term());
excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
offset = t.endOffset();
@@ -354,18 +355,25 @@ public class BasicSummarizer implements
private Token[] getTokens(String text) {
- ArrayList result = new ArrayList();
+ ArrayList<Token> result = new ArrayList<Token>();
TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
- Token token = null;
- while (result.size()<token_deep) {
- try {
- token = ts.next();
- } catch (IOException e) {
- token = null;
+ TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
+ TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
+ try {
+ while (result.size() < token_deep && ts.incrementToken()) {
+ final Token token = new Token(
+ termAtt.termBuffer(), 0, termAtt.termLength(),
+ offsetAtt.startOffset(), offsetAtt.endOffset());
+ token.setType(typeAtt.type());
+ token.setPositionIncrement(posIncrAtt.getPositionIncrement());
+ result.add(token);
}
- if (token == null) { break; }
- result.add(token);
+ } catch (IOException e) {
+ // Ignore (?)
}
+
try {
ts.close();
} catch (IOException e) {
Added: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml Fri Mar 19 11:34:33 2010
@@ -25,7 +25,7 @@
<library name="summary-lucene.jar">
<export name="*"/>
</library>
- <library name="lucene-highlighter-2.9.1.jar"/>
+ <library name="lucene-highlighter-3.0.1.jar"/>
</runtime>
<requires>
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Fri Mar 19 11:34:33 2010
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.indexer;
+import java.io.File;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
@@ -62,7 +63,7 @@ public class TestDeleteDuplicates extend
private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
Path idx = new Path(root, name);
Path sub = new Path(idx, "part-0000");
- Directory dir = FSDirectory.getDirectory(sub.toString());
+ Directory dir = FSDirectory.open(new File(sub.toString()));
IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
MaxFieldLength.UNLIMITED);
Document doc = makeDoc(name,
@@ -89,7 +90,7 @@ public class TestDeleteDuplicates extend
private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
Path idx = new Path(root, name);
Path sub = new Path(idx, "part-0000");
- Directory dir = FSDirectory.getDirectory(sub.toString());
+ Directory dir = FSDirectory.open(new File(sub.toString()));
IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
MaxFieldLength.UNLIMITED);
Document doc = makeDoc(name,
@@ -105,7 +106,7 @@ public class TestDeleteDuplicates extend
Document doc = new Document();
doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.NO));
- doc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(new Field("url", url, Field.Store.YES, Field.Index.ANALYZED));
doc.setBoost(boost);
doc.add(new Field("boost", "" + boost, Field.Store.YES, Field.Index.NO));
doc.add(new Field("tstamp", DateTools.timeToString(time, Resolution.MILLISECOND), Field.Store.YES, Field.Index.NO));
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java Fri Mar 19 11:34:33 2010
@@ -29,6 +29,7 @@ import org.apache.lucene.document.Field.
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
@@ -67,8 +68,9 @@ public class TestIndexSorter extends Tes
}
LOG.info("Creating test index: " + testDir.getAbsolutePath());
File plain = new File(testDir, INDEX_PLAIN);
- Directory dir = FSDirectory.getDirectory(plain);
- IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+ Directory dir = FSDirectory.open(plain);
+ IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
+ MaxFieldLength.UNLIMITED);
// create test documents
for (int i = 0; i < NUM_DOCS; i++) {
Document doc = new Document();
@@ -79,19 +81,19 @@ public class TestIndexSorter extends Tes
String val = null;
if (fieldNames[k].equals("id")) {
s = Store.YES;
- ix = Index.UN_TOKENIZED;
+ ix = Index.NOT_ANALYZED;
val = String.valueOf(i);
} else if (fieldNames[k].equals("host")) {
s = Store.YES;
- ix = Index.UN_TOKENIZED;
+ ix = Index.NOT_ANALYZED;
val = "www.example" + i + ".com";
} else if (fieldNames[k].equals("site")) {
s = Store.NO;
- ix = Index.UN_TOKENIZED;
+ ix = Index.NOT_ANALYZED;
val = "www.example" + i + ".com";
} else if (fieldNames[k].equals("content")) {
s = Store.NO;
- ix = Index.TOKENIZED;
+ ix = Index.ANALYZED;
val = "This is the content of the " + i + "-th document.";
} else if (fieldNames[k].equals("boost")) {
s = Store.YES;
@@ -104,7 +106,7 @@ public class TestIndexSorter extends Tes
doc.setBoost(boost);
} else {
s = Store.YES;
- ix = Index.TOKENIZED;
+ ix = Index.ANALYZED;
if (fieldNames[k].equals("anchor")) {
val = "anchors to " + i + "-th page.";
} else if (fieldNames[k].equals("url")) {
@@ -127,8 +129,9 @@ public class TestIndexSorter extends Tes
public void testSorting() throws Exception {
IndexSorter sorter = new IndexSorter(conf);
sorter.sort(testDir);
+
// read back documents
- IndexReader reader = IndexReader.open(new File(testDir, INDEX_SORTED));
+ IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
assertEquals(reader.numDocs(), NUM_DOCS);
for (int i = 0; i < reader.maxDoc(); i++) {
Document doc = reader.document(i);