You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/11 23:30:25 UTC

svn commit: r213607 - in /lucene/nutch/branches/mapred: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/searcher/

Author: cutting
Date: Mon Jul 11 14:30:22 2005
New Revision: 213607

URL: http://svn.apache.org/viewcvs?rev=213607&view=rev
Log:
Get search working on NDFS-resident, MapReduce-created crawl.

Modified:
    lucene/nutch/branches/mapred/build.xml
    lucene/nutch/branches/mapred/conf/nutch-default.xml
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java

Modified: lucene/nutch/branches/mapred/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/build.xml (original)
+++ lucene/nutch/branches/mapred/build.xml Mon Jul 11 14:30:22 2005
@@ -119,7 +119,7 @@
   <!-- ================================================================== -->
   <!--                                                                    -->
   <!-- ================================================================== -->
-  <target name="war" depends="compile,generate-docs">
+  <target name="war" depends="jar,compile,generate-docs">
     <war destfile="${build.dir}/${final.name}.war"
 	 webxml="${web.src.dir}/web.xml">
       <fileset dir="${web.src.dir}/jsp"/>

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Jul 11 14:30:22 2005
@@ -498,9 +498,9 @@
 
 <property>
   <name>searcher.dir</name>
-  <value>.</value>
+  <value>crawl</value>
   <description>
-  Path to root of index directories.  This directory is searched (in
+  Path to root of crawl.  This directory is searched (in
   order) for either the file search-servers.txt, containing a list of
   distributed search servers, or the directory "index" containing
   merged indexes, or the directory "segments" containing segment

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Mon Jul 11 14:30:22 2005
@@ -100,6 +100,7 @@
             writer.optimize();
             writer.close();
             fs.completeLocalOutput(perm, temp);   // copy to ndfs
+            fs.createNewFile(new File(perm, IndexSegment.DONE_NAME));
           }
         };
     }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java Mon Jul 11 14:30:22 2005
@@ -1,5 +1,3 @@
-package org.apache.lucene.store;
-
 /**
  * Copyright 2004 The Apache Software Foundation
  *
@@ -15,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+package org.apache.nutch.indexer;
 
 import java.io.*;
 import org.apache.lucene.store.*;

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java Mon Jul 11 14:30:22 2005
@@ -39,57 +39,64 @@
     private NutchFileSystem nfs;
     private File segmentDir;
 
-    private ArrayFile.Reader fetcher;
-    private ArrayFile.Reader content;
-    private ArrayFile.Reader text;
-    private ArrayFile.Reader parsedata;
+    private MapFile.Reader[] content;
+    private MapFile.Reader[] parseText;
+    private MapFile.Reader[] parseData;
 
     public Segment(NutchFileSystem nfs, File segmentDir) throws IOException {
       this.nfs = nfs;
       this.segmentDir = segmentDir;
     }
 
-    public FetcherOutput getFetcherOutput(int docNo) throws IOException {
-      if (fetcher == null) { 
-        this.fetcher = new ArrayFile.Reader
-          (nfs, new File(segmentDir, FetcherOutput.DIR_NAME).toString());
-      }
-
-      FetcherOutput entry = new FetcherOutput();
-      fetcher.get(docNo, entry);
-      return entry;
+    public FetcherOutput getFetcherOutput(UTF8 url) throws IOException {
+      throw new UnsupportedOperationException();
     }
 
-    public byte[] getContent(int docNo) throws IOException {
-      if (content == null) {
-        this.content = new ArrayFile.Reader
-          (nfs, new File(segmentDir, Content.DIR_NAME).toString());
+    public byte[] getContent(UTF8 url) throws IOException {
+      synchronized (this) {
+        if (content == null) {
+          File[] parts = nfs.listFiles(new File(segmentDir, Content.DIR_NAME));
+          content = new MapFile.Reader[parts.length];
+          for (int i = 0; i < parts.length; i++) {
+            content[i] = new MapFile.Reader(nfs, parts[i].toString());
+          }
+        }
       }
 
       Content entry = new Content();
-      content.get(docNo, entry);
+      content[url.hashCode()%content.length].get(url, entry);
       return entry.getContent();
     }
 
-    public ParseData getParseData(int docNo) throws IOException {
-      if (parsedata == null) {
-        this.parsedata = new ArrayFile.Reader
-          (nfs, new File(segmentDir, ParseData.DIR_NAME).toString());
+    public ParseData getParseData(UTF8 url) throws IOException {
+      synchronized (this) {
+        if (parseData == null) {
+          File[] parts=nfs.listFiles(new File(segmentDir, ParseData.DIR_NAME));
+          parseData = new MapFile.Reader[parts.length];
+          for (int i = 0; i < parts.length; i++) {
+            parseData[i] = new MapFile.Reader(nfs, parts[i].toString());
+          }
+        }
       }
       
       ParseData entry = new ParseData();
-      parsedata.get(docNo, entry);
+      parseData[url.hashCode()%parseData.length].get(url, entry);
       return entry;
     }
 
-    public ParseText getParseText(int docNo) throws IOException {
-      if (text == null) {
-        this.text = new ArrayFile.Reader
-          (nfs, new File(segmentDir, ParseText.DIR_NAME).toString());
+    public ParseText getParseText(UTF8 url) throws IOException {
+      synchronized (this) {
+        if (parseText == null) {
+          File[] parts=nfs.listFiles(new File(segmentDir, ParseText.DIR_NAME));
+          parseText = new MapFile.Reader[parts.length];
+          for (int i = 0; i < parts.length; i++) {
+            parseText[i] = new MapFile.Reader(nfs, parts[i].toString());
+          }
+        }
       }
-
+      
       ParseText entry = new ParseText();
-      text.get(docNo, entry);
+      parseText[url.hashCode()%parseText.length].get(url, entry);
       return entry;
     }
     
@@ -104,10 +111,12 @@
     if (segmentDirs != null) {
         for (int i = 0; i < segmentDirs.length; i++) {
             File segmentDir = segmentDirs[i];
-            File indexdone = new File(segmentDir, IndexSegment.DONE_NAME);
-            if (nfs.exists(indexdone) && nfs.isFile(indexdone)) {
-            	segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
-            }
+//             File indexdone = new File(segmentDir, IndexSegment.DONE_NAME);
+//             if (nfs.exists(indexdone) && nfs.isFile(indexdone)) {
+//             	segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
+//             }
+            segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
+
         }
     }
   }
@@ -117,31 +126,31 @@
   }
 
   public byte[] getContent(HitDetails details) throws IOException {
-    return getSegment(details).getContent(getDocNo(details));
+    return getSegment(details).getContent(getUrl(details));
   }
 
   public ParseData getParseData(HitDetails details) throws IOException {
-    return getSegment(details).getParseData(getDocNo(details));
+    return getSegment(details).getParseData(getUrl(details));
   }
 
   public String[] getAnchors(HitDetails details) throws IOException {
-    return getSegment(details).getFetcherOutput(getDocNo(details))
+    return getSegment(details).getFetcherOutput(getUrl(details))
       .getFetchListEntry().getAnchors();
   }
 
   public long getFetchDate(HitDetails details) throws IOException {
-    return getSegment(details).getFetcherOutput(getDocNo(details))
+    return getSegment(details).getFetcherOutput(getUrl(details))
       .getFetchDate();
   }
 
   public ParseText getParseText(HitDetails details) throws IOException {
-    return getSegment(details).getParseText(getDocNo(details));
+    return getSegment(details).getParseText(getUrl(details));
   }
 
   public String getSummary(HitDetails details, Query query)
     throws IOException {
 
-    String text = getSegment(details).getParseText(getDocNo(details)).getText();
+    String text = getSegment(details).getParseText(getUrl(details)).getText();
 
     return new Summarizer().getSummary(text, query).toString();
   }
@@ -199,8 +208,8 @@
     return (Segment)segments.get(details.getValue("segment"));
   }
 
-  private int getDocNo(HitDetails details) {
-    return Integer.parseInt(details.getValue("docNo"), 16);
+  private UTF8 getUrl(HitDetails details) {
+    return new UTF8(details.getValue("url"));
   }
 
 

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java Mon Jul 11 14:30:22 2005
@@ -47,12 +47,12 @@
   }
 
   /** Construct minimal details from a segment name and document number. */
-  public HitDetails(String segment, String docNo) {
+  public HitDetails(String segment, String url) {
     this(new String[2], new String[2]);
     this.fields[0] = "segment";
     this.values[0] = segment;
-    this.fields[1] = "docNo";
-    this.values[1] = docNo;
+    this.fields[1] = "url";
+    this.values[1] = url;
   }
 
   /** Returns the number of fields contained in this. */
@@ -102,7 +102,7 @@
 
   /** Display as a string. */
   public String toString() {
-    return getValue("segment") + "/" + getValue("docNo");
+    return getValue("segment") + "/" + getValue("url");
   }
 
   /** Display as HTML. */

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java Mon Jul 11 14:30:22 2005
@@ -22,6 +22,9 @@
 import java.util.ArrayList;
 import java.util.Enumeration;
 
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
 
@@ -34,6 +37,7 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
+import org.apache.nutch.fs.*;
 import org.apache.nutch.io.*;
 import org.apache.nutch.util.*;
 import org.apache.nutch.db.*;
@@ -43,7 +47,7 @@
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
 
 /** Implements {@link Searcher} and {@link HitDetailer} for either a single
- * merged index, or for a set of individual segment indexes. */
+ * merged index, or a set of indexes. */
 public class IndexSearcher implements Searcher, HitDetailer {
 
   private org.apache.lucene.search.Searcher luceneSearcher;
@@ -53,26 +57,34 @@
     (NutchConf.get().getInt("searcher.filter.cache.size", 16),
      NutchConf.get().getFloat("searcher.filter.cache.threshold", 0.05f));
 
-  /** Construct given a number of indexed segments. */
-  public IndexSearcher(File[] segmentDirs) throws IOException {
-    IndexReader[] readers = new IndexReader[segmentDirs.length];
-    for (int i = 0; i < segmentDirs.length; i++) {
-      readers[i] = IndexReader.open(new File(segmentDirs[i], "index"));
+  /** Construct given a number of indexes. */
+  public IndexSearcher(File[] indexDirs) throws IOException {
+    IndexReader[] readers = new IndexReader[indexDirs.length];
+    for (int i = 0; i < indexDirs.length; i++) {
+      readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
     }
     init(new MultiReader(readers));
   }
 
-  /** Construct given a directory containing fetched segments, and a separate
-   * directory naming their merged index. */
-  public IndexSearcher(String index)
+  /** Construct given a single merged index. */
+  public IndexSearcher(File index)
     throws IOException {
-    init(IndexReader.open(index));
+    init(IndexReader.open(getDirectory(index)));
   }
 
   private void init(IndexReader reader) throws IOException {
     this.reader = reader;
     this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
     this.luceneSearcher.setSimilarity(new NutchSimilarity());
+  }
+
+  private Directory getDirectory(File file) throws IOException {
+    NutchFileSystem fs = NutchFileSystem.get();
+    if ("local".equals(fs.getName())) {
+      return FSDirectory.getDirectory(file, false);
+    } else {
+      return new NdfsDirectory(fs, file, false);
+    }
   }
 
   public Hits search(Query query, int numHits,

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java?rev=213607&r1=213606&r2=213607&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java Mon Jul 11 14:30:22 2005
@@ -42,6 +42,8 @@
     LogFormatter.setShowThreadIDs(true);
   }
 
+  private NutchFileSystem fs = NutchFileSystem.get();
+
   private String[] segmentNames;
 
   private Searcher searcher;
@@ -69,33 +71,36 @@
 
   /** Construct reading from connected directory. */
   public NutchBean() throws IOException {
-    this(new File(NutchConf.get().get("searcher.dir", ".")));
+    this(new File(NutchConf.get().get("searcher.dir", "crawl")));
   }
 
   /** Construct in a named directory. */
   public NutchBean(File dir) throws IOException {
     File servers = new File(dir, "search-servers.txt");
-    if (servers.exists()) {
+    if (fs.exists(servers)) {
       LOG.info("searching servers in " + servers.getCanonicalPath());
       init(new DistributedSearch.Client(servers));
     } else {
-      init(new File(dir, "index"), new File(dir, "segments"));
+      init(new File(dir, "index"),
+           new File(dir, "indexes"),
+           new File(dir, "segments"));
     }
   }
 
-  private void init(File indexDir, File segmentsDir) throws IOException {
+  private void init(File indexDir, File indexesDir, File segmentsDir)
+    throws IOException {
     IndexSearcher indexSearcher;
-    if (indexDir.exists()) {
-      LOG.info("opening merged index in " + indexDir.getCanonicalPath());
-      indexSearcher = new IndexSearcher(indexDir.getCanonicalPath());
+    if (fs.exists(indexDir)) {
+      LOG.info("opening merged index in " + indexDir);
+      indexSearcher = new IndexSearcher(indexDir);
     } else {
-      LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath());
+      LOG.info("opening indexes in " + indexesDir);
       
       Vector vDirs=new Vector();
-      File [] directories = segmentsDir.listFiles();
-      for(int i = 0; i < segmentsDir.listFiles().length; i++) {
+      File [] directories = fs.listFiles(indexesDir);
+      for(int i = 0; i < fs.listFiles(indexesDir).length; i++) {
         File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
-        if(indexdone.exists() && indexdone.isFile()) {
+        if(fs.isFile(indexdone)) {
           vDirs.add(directories[i]);
         }
       }
@@ -108,7 +113,8 @@
       indexSearcher = new IndexSearcher(directories);
     }
 
-    FetchedSegments segments = new FetchedSegments(new LocalFileSystem(), segmentsDir.toString());
+    LOG.info("opening segments in " + segmentsDir);
+    FetchedSegments segments = new FetchedSegments(fs, segmentsDir.toString());
     
     this.segmentNames = segments.getSegmentNames();
     
@@ -338,7 +344,7 @@
     String[] summaries = bean.getSummary(details, query);
 
     for (int i = 0; i < hits.getLength(); i++) {
-      System.out.println(" "+i+" "+ details[i]);// + "\n" + summaries[i]);
+      System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
     }
   }