You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/11 23:30:25 UTC
svn commit: r213607 - in /lucene/nutch/branches/mapred: ./ conf/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/searcher/
Author: cutting
Date: Mon Jul 11 14:30:22 2005
New Revision: 213607
URL: http://svn.apache.org/viewcvs?rev=213607&view=rev
Log:
Get search working on NDFS-resident, MapReduce-created crawl.
Modified:
lucene/nutch/branches/mapred/build.xml
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
Modified: lucene/nutch/branches/mapred/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/build.xml (original)
+++ lucene/nutch/branches/mapred/build.xml Mon Jul 11 14:30:22 2005
@@ -119,7 +119,7 @@
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
- <target name="war" depends="compile,generate-docs">
+ <target name="war" depends="jar,compile,generate-docs">
<war destfile="${build.dir}/${final.name}.war"
webxml="${web.src.dir}/web.xml">
<fileset dir="${web.src.dir}/jsp"/>
Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Jul 11 14:30:22 2005
@@ -498,9 +498,9 @@
<property>
<name>searcher.dir</name>
- <value>.</value>
+ <value>crawl</value>
<description>
- Path to root of index directories. This directory is searched (in
+ Path to root of crawl. This directory is searched (in
order) for either the file search-servers.txt, containing a list of
distributed search servers, or the directory "index" containing
merged indexes, or the directory "segments" containing segment
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Mon Jul 11 14:30:22 2005
@@ -100,6 +100,7 @@
writer.optimize();
writer.close();
fs.completeLocalOutput(perm, temp); // copy to ndfs
+ fs.createNewFile(new File(perm, IndexSegment.DONE_NAME));
}
};
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java Mon Jul 11 14:30:22 2005
@@ -1,5 +1,3 @@
-package org.apache.lucene.store;
-
/**
* Copyright 2004 The Apache Software Foundation
*
@@ -15,6 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+package org.apache.nutch.indexer;
import java.io.*;
import org.apache.lucene.store.*;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java Mon Jul 11 14:30:22 2005
@@ -39,57 +39,64 @@
private NutchFileSystem nfs;
private File segmentDir;
- private ArrayFile.Reader fetcher;
- private ArrayFile.Reader content;
- private ArrayFile.Reader text;
- private ArrayFile.Reader parsedata;
+ private MapFile.Reader[] content;
+ private MapFile.Reader[] parseText;
+ private MapFile.Reader[] parseData;
public Segment(NutchFileSystem nfs, File segmentDir) throws IOException {
this.nfs = nfs;
this.segmentDir = segmentDir;
}
- public FetcherOutput getFetcherOutput(int docNo) throws IOException {
- if (fetcher == null) {
- this.fetcher = new ArrayFile.Reader
- (nfs, new File(segmentDir, FetcherOutput.DIR_NAME).toString());
- }
-
- FetcherOutput entry = new FetcherOutput();
- fetcher.get(docNo, entry);
- return entry;
+ public FetcherOutput getFetcherOutput(UTF8 url) throws IOException {
+ throw new UnsupportedOperationException();
}
- public byte[] getContent(int docNo) throws IOException {
- if (content == null) {
- this.content = new ArrayFile.Reader
- (nfs, new File(segmentDir, Content.DIR_NAME).toString());
+ public byte[] getContent(UTF8 url) throws IOException {
+ synchronized (this) {
+ if (content == null) {
+ File[] parts = nfs.listFiles(new File(segmentDir, Content.DIR_NAME));
+ content = new MapFile.Reader[parts.length];
+ for (int i = 0; i < parts.length; i++) {
+ content[i] = new MapFile.Reader(nfs, parts[i].toString());
+ }
+ }
}
Content entry = new Content();
- content.get(docNo, entry);
+ content[url.hashCode()%content.length].get(url, entry);
return entry.getContent();
}
- public ParseData getParseData(int docNo) throws IOException {
- if (parsedata == null) {
- this.parsedata = new ArrayFile.Reader
- (nfs, new File(segmentDir, ParseData.DIR_NAME).toString());
+ public ParseData getParseData(UTF8 url) throws IOException {
+ synchronized (this) {
+ if (parseData == null) {
+ File[] parts=nfs.listFiles(new File(segmentDir, ParseData.DIR_NAME));
+ parseData = new MapFile.Reader[parts.length];
+ for (int i = 0; i < parts.length; i++) {
+ parseData[i] = new MapFile.Reader(nfs, parts[i].toString());
+ }
+ }
}
ParseData entry = new ParseData();
- parsedata.get(docNo, entry);
+ parseData[url.hashCode()%parseData.length].get(url, entry);
return entry;
}
- public ParseText getParseText(int docNo) throws IOException {
- if (text == null) {
- this.text = new ArrayFile.Reader
- (nfs, new File(segmentDir, ParseText.DIR_NAME).toString());
+ public ParseText getParseText(UTF8 url) throws IOException {
+ synchronized (this) {
+ if (parseText == null) {
+ File[] parts=nfs.listFiles(new File(segmentDir, ParseText.DIR_NAME));
+ parseText = new MapFile.Reader[parts.length];
+ for (int i = 0; i < parts.length; i++) {
+ parseText[i] = new MapFile.Reader(nfs, parts[i].toString());
+ }
+ }
}
-
+
ParseText entry = new ParseText();
- text.get(docNo, entry);
+ parseText[url.hashCode()%parseText.length].get(url, entry);
return entry;
}
@@ -104,10 +111,12 @@
if (segmentDirs != null) {
for (int i = 0; i < segmentDirs.length; i++) {
File segmentDir = segmentDirs[i];
- File indexdone = new File(segmentDir, IndexSegment.DONE_NAME);
- if (nfs.exists(indexdone) && nfs.isFile(indexdone)) {
- segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
- }
+// File indexdone = new File(segmentDir, IndexSegment.DONE_NAME);
+// if (nfs.exists(indexdone) && nfs.isFile(indexdone)) {
+// segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
+// }
+ segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
+
}
}
}
@@ -117,31 +126,31 @@
}
public byte[] getContent(HitDetails details) throws IOException {
- return getSegment(details).getContent(getDocNo(details));
+ return getSegment(details).getContent(getUrl(details));
}
public ParseData getParseData(HitDetails details) throws IOException {
- return getSegment(details).getParseData(getDocNo(details));
+ return getSegment(details).getParseData(getUrl(details));
}
public String[] getAnchors(HitDetails details) throws IOException {
- return getSegment(details).getFetcherOutput(getDocNo(details))
+ return getSegment(details).getFetcherOutput(getUrl(details))
.getFetchListEntry().getAnchors();
}
public long getFetchDate(HitDetails details) throws IOException {
- return getSegment(details).getFetcherOutput(getDocNo(details))
+ return getSegment(details).getFetcherOutput(getUrl(details))
.getFetchDate();
}
public ParseText getParseText(HitDetails details) throws IOException {
- return getSegment(details).getParseText(getDocNo(details));
+ return getSegment(details).getParseText(getUrl(details));
}
public String getSummary(HitDetails details, Query query)
throws IOException {
- String text = getSegment(details).getParseText(getDocNo(details)).getText();
+ String text = getSegment(details).getParseText(getUrl(details)).getText();
return new Summarizer().getSummary(text, query).toString();
}
@@ -199,8 +208,8 @@
return (Segment)segments.get(details.getValue("segment"));
}
- private int getDocNo(HitDetails details) {
- return Integer.parseInt(details.getValue("docNo"), 16);
+ private UTF8 getUrl(HitDetails details) {
+ return new UTF8(details.getValue("url"));
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java Mon Jul 11 14:30:22 2005
@@ -47,12 +47,12 @@
}
/** Construct minimal details from a segment name and document number. */
- public HitDetails(String segment, String docNo) {
+ public HitDetails(String segment, String url) {
this(new String[2], new String[2]);
this.fields[0] = "segment";
this.values[0] = segment;
- this.fields[1] = "docNo";
- this.values[1] = docNo;
+ this.fields[1] = "url";
+ this.values[1] = url;
}
/** Returns the number of fields contained in this. */
@@ -102,7 +102,7 @@
/** Display as a string. */
public String toString() {
- return getValue("segment") + "/" + getValue("docNo");
+ return getValue("segment") + "/" + getValue("url");
}
/** Display as HTML. */
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java Mon Jul 11 14:30:22 2005
@@ -22,6 +22,9 @@
import java.util.ArrayList;
import java.util.Enumeration;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
@@ -34,6 +37,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.nutch.fs.*;
import org.apache.nutch.io.*;
import org.apache.nutch.util.*;
import org.apache.nutch.db.*;
@@ -43,7 +47,7 @@
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
/** Implements {@link Searcher} and {@link HitDetailer} for either a single
- * merged index, or for a set of individual segment indexes. */
+ * merged index, or a set of indexes. */
public class IndexSearcher implements Searcher, HitDetailer {
private org.apache.lucene.search.Searcher luceneSearcher;
@@ -53,26 +57,34 @@
(NutchConf.get().getInt("searcher.filter.cache.size", 16),
NutchConf.get().getFloat("searcher.filter.cache.threshold", 0.05f));
- /** Construct given a number of indexed segments. */
- public IndexSearcher(File[] segmentDirs) throws IOException {
- IndexReader[] readers = new IndexReader[segmentDirs.length];
- for (int i = 0; i < segmentDirs.length; i++) {
- readers[i] = IndexReader.open(new File(segmentDirs[i], "index"));
+ /** Construct given a number of indexes. */
+ public IndexSearcher(File[] indexDirs) throws IOException {
+ IndexReader[] readers = new IndexReader[indexDirs.length];
+ for (int i = 0; i < indexDirs.length; i++) {
+ readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
}
init(new MultiReader(readers));
}
- /** Construct given a directory containing fetched segments, and a separate
- * directory naming their merged index. */
- public IndexSearcher(String index)
+ /** Construct given a single merged index. */
+ public IndexSearcher(File index)
throws IOException {
- init(IndexReader.open(index));
+ init(IndexReader.open(getDirectory(index)));
}
private void init(IndexReader reader) throws IOException {
this.reader = reader;
this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
this.luceneSearcher.setSimilarity(new NutchSimilarity());
+ }
+
+ private Directory getDirectory(File file) throws IOException {
+ NutchFileSystem fs = NutchFileSystem.get();
+ if ("local".equals(fs.getName())) {
+ return FSDirectory.getDirectory(file, false);
+ } else {
+ return new NdfsDirectory(fs, file, false);
+ }
}
public Hits search(Query query, int numHits,
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java Mon Jul 11 14:30:22 2005
@@ -42,6 +42,8 @@
LogFormatter.setShowThreadIDs(true);
}
+ private NutchFileSystem fs = NutchFileSystem.get();
+
private String[] segmentNames;
private Searcher searcher;
@@ -69,33 +71,36 @@
/** Construct reading from connected directory. */
public NutchBean() throws IOException {
- this(new File(NutchConf.get().get("searcher.dir", ".")));
+ this(new File(NutchConf.get().get("searcher.dir", "crawl")));
}
/** Construct in a named directory. */
public NutchBean(File dir) throws IOException {
File servers = new File(dir, "search-servers.txt");
- if (servers.exists()) {
+ if (fs.exists(servers)) {
LOG.info("searching servers in " + servers.getCanonicalPath());
init(new DistributedSearch.Client(servers));
} else {
- init(new File(dir, "index"), new File(dir, "segments"));
+ init(new File(dir, "index"),
+ new File(dir, "indexes"),
+ new File(dir, "segments"));
}
}
- private void init(File indexDir, File segmentsDir) throws IOException {
+ private void init(File indexDir, File indexesDir, File segmentsDir)
+ throws IOException {
IndexSearcher indexSearcher;
- if (indexDir.exists()) {
- LOG.info("opening merged index in " + indexDir.getCanonicalPath());
- indexSearcher = new IndexSearcher(indexDir.getCanonicalPath());
+ if (fs.exists(indexDir)) {
+ LOG.info("opening merged index in " + indexDir);
+ indexSearcher = new IndexSearcher(indexDir);
} else {
- LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath());
+ LOG.info("opening indexes in " + indexesDir);
Vector vDirs=new Vector();
- File [] directories = segmentsDir.listFiles();
- for(int i = 0; i < segmentsDir.listFiles().length; i++) {
+ File [] directories = fs.listFiles(indexesDir);
+ for(int i = 0; i < fs.listFiles(indexesDir).length; i++) {
File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
- if(indexdone.exists() && indexdone.isFile()) {
+ if(fs.isFile(indexdone)) {
vDirs.add(directories[i]);
}
}
@@ -108,7 +113,8 @@
indexSearcher = new IndexSearcher(directories);
}
- FetchedSegments segments = new FetchedSegments(new LocalFileSystem(), segmentsDir.toString());
+ LOG.info("opening segments in " + segmentsDir);
+ FetchedSegments segments = new FetchedSegments(fs, segmentsDir.toString());
this.segmentNames = segments.getSegmentNames();
@@ -338,7 +344,7 @@
String[] summaries = bean.getSummary(details, query);
for (int i = 0; i < hits.getLength(); i++) {
- System.out.println(" "+i+" "+ details[i]);// + "\n" + summaries[i]);
+ System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
}
}