You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/09/02 14:38:51 UTC

svn commit: r439582 - in /lucene/nutch/trunk/src/test: crawl-tests.xml org/apache/nutch/crawl/CrawlDBTestUtil.java org/apache/nutch/crawl/TestGenerator.java org/apache/nutch/crawl/TestInjector.java

Author: siren
Date: Sat Sep  2 05:38:50 2006
New Revision: 439582

URL: http://svn.apache.org/viewvc?rev=439582&view=rev
Log:
Add simple unit tests for injector and generator

Added:
    lucene/nutch/trunk/src/test/crawl-tests.xml
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java

Added: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (added)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Sat Sep  2 05:38:50 2006
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+
+<!-- Configuration overrides used during unit tests. -->
+
+<configuration>
+
+<!-- Loaded by CrawlDBTestUtil.create() as a final resource on top of
+     nutch-default.xml. The value below names two plugins, urlfilter-suffix
+     and scoring-opic, separated by '|' (presumably a regular-expression
+     alternation over plugin ids; confirm against nutch-default.xml). -->
+<property>
+  <name>plugin.includes</name>
+  <value>urlfilter-suffix|scoring-opic</value>
+  <description>Enable required plugins.</description>
+</property>
+
+</configuration>
\ No newline at end of file

Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Sat Sep  2 05:38:50 2006
@@ -0,0 +1,69 @@
+package org.apache.nutch.crawl;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.UTF8;
+
+public class CrawlDBTestUtil {
+
+  private static final Log LOG = LogFactory.getLog(CrawlDBTestUtil.class);
+
+  /**
+   * Creates a synthetic crawldb containing the given entries.
+   * <p>
+   * A single map file named <code>part-00000</code> is written below the
+   * {@link CrawlDatum#DB_DIR_NAME} subdirectory of <code>crawldb</code>, and
+   * the entries are appended in list order.
+   * NOTE(review): MapFile.Writer presumably requires keys to be appended in
+   * ascending order, so callers should pass urls already sorted - confirm.
+   * 
+   * @param fs
+   *          filesystem where db will be created
+   * @param crawldb
+   *          path where db will be created
+   * @param init
+   *          urls to be inserted, objects are of type URLCrawlDatum
+   * @throws Exception
+   *           if the map file cannot be created, written or closed
+   */
+  public static void createCrawlDb(FileSystem fs, Path crawldb, List<URLCrawlDatum> init)
+      throws Exception {
+    LOG.trace("* creating crawldb: " + crawldb);
+    Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
+    MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000")
+        .toString(), UTF8.class, CrawlDatum.class);
+    try {
+      for (URLCrawlDatum row : init) {
+        LOG.info("adding:" + row.url.toString());
+        writer.append(new UTF8(row.url), row.datum);
+      }
+    } finally {
+      // close even when an append fails, so the file handle is not leaked
+      writer.close();
+    }
+  }
+
+  /**
+   * Builds the Configuration used by the crawl tests.
+   * <p>
+   * For now we need to manually construct our Configuration, because we need to
+   * override the default one and it is currently not possible to use dynamically
+   * set values.
+   * 
+   * @return a new Configuration with <code>nutch-default.xml</code> added as a
+   *         default resource and <code>crawl-tests.xml</code> added as a final
+   *         resource
+   */
+  public static Configuration create(){
+    Configuration conf=new Configuration();
+    conf.addDefaultResource("nutch-default.xml");
+    conf.addFinalResource("crawl-tests.xml");
+    return conf;
+  }
+
+  /**
+   * Simple pair of a url and the CrawlDatum stored for it.
+   */
+  public static class URLCrawlDatum {
+
+    /** Key of the crawldb entry. */
+    UTF8 url;
+
+    /** Value of the crawldb entry. */
+    CrawlDatum datum;
+
+    /**
+     * @param url
+     *          url of the entry
+     * @param datum
+     *          crawl data belonging to the url
+     */
+    public URLCrawlDatum(UTF8 url, CrawlDatum datum) {
+      this.url = url;
+      this.datum = datum;
+    }
+  }
+}

Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sat Sep  2 05:38:50 2006
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+
+import junit.framework.TestCase;
+
+/**
+ * Basic generator test:
+ * 1. Insert entries in crawldb
+ * 2. Generates entries to fetch
+ * 3. Verifies that the number of generated urls matches
+ * 4. Verifies that the highest scoring urls are generated
+ *
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
+ *
+ */
+public class TestGenerator extends TestCase {
+
+  Configuration conf;
+
+  Path dbDir;
+
+  Path segmentsDir;
+
+  FileSystem fs;
+
+  protected void setUp() throws Exception {
+    conf = CrawlDBTestUtil.create();
+  }
+
+  protected void tearDown() {
+    delete(dbDir);
+    delete(segmentsDir);
+  }
+
+  private void delete(Path p) {
+    try {
+      fs.delete(p);
+    } catch (IOException e) {
+    }
+  }
+
+  /**
+   * Test that generator generates fetchlish ordered by score (desc)
+   * 
+   * @throws Exception
+   */
+  public void testGenerateHighest() throws Exception {
+
+    int NUM_RESULTS=2;
+    
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+    
+    for(int i=0;i<=100;i++){
+      list.add(new CrawlDBTestUtil.URLCrawlDatum(new UTF8("http://aaa/" + pad(i)),
+        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
+    }
+    
+    fs = FileSystem.get(conf);
+    dbDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
+    segmentsDir = new Path("test-crawldb-segments" + new java.util.Random().nextInt());
+    fs.mkdirs(dbDir);
+    fs.mkdirs(segmentsDir);
+    
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
+    
+    // generate segment
+    Generator g=new Generator(conf);
+    Path generatedSegment=g.generate(dbDir, segmentsDir,0,NUM_RESULTS, Long.MAX_VALUE);
+    
+    Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),"part-00000");
+    
+    // verify results
+    SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf);
+    
+    ArrayList<URLCrawlDatum> l=new ArrayList<URLCrawlDatum>();
+    
+    READ:
+      do {
+      UTF8 key=new UTF8();
+      CrawlDatum value=new CrawlDatum();
+      if(!reader.next(key, value)) break READ;
+      l.add(new URLCrawlDatum(key, value));
+    } while(true);
+
+    reader.close();
+
+    // sort urls by score desc
+    Collections.sort(l, new ScoreComparator());
+
+    //verify we got right amount of records
+    assertEquals(NUM_RESULTS, l.size());
+
+    //verify we have the highest scoring urls
+    assertEquals("http://aaa/100", (l.get(0).url.toString()));
+    assertEquals("http://aaa/099", (l.get(1).url.toString()));
+  }
+
+  private String pad(int i) {
+    String s=Integer.toString(i);
+    while(s.length()<3)
+      s="0" + s;
+    return s;
+  }
+
+  /**
+   * Comparator that sorts by score desc
+   */
+  public class ScoreComparator implements Comparator<URLCrawlDatum> {
+
+    public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
+
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0)
+        return -1;
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0)
+        return 1;
+
+      return 0;
+    }
+  }
+}

Added: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=439582&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Sat Sep  2 05:38:50 2006
@@ -0,0 +1,128 @@
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+
+import junit.framework.TestCase;
+
+/**
+ * Basic injector test:
+ * 1. Creates a text file with urls
+ * 2. Injects them into crawldb
+ * 3. Reads crawldb entries and verifies contents
+ * 4. Injects more urls into crawldb
+ * 5. Reads crawldb entries and verifies contents
+ * 
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
+ */
+public class TestInjector extends TestCase {
+
+  private Configuration conf;
+  private FileSystem fs;
+  final static Path testdir=new Path("build/test/inject-test");
+  Path crawldbPath;
+  Path urlPath;
+  
+  /**
+   * Creates the test configuration and filesystem and derives the seed list
+   * and crawldb locations below {@link #testdir}.
+   */
+  @Override
+  protected void setUp() throws Exception {
+    conf = CrawlDBTestUtil.create();
+    urlPath=new Path(testdir,"urls");
+    crawldbPath=new Path(testdir,"crawldb");
+    fs=FileSystem.get(conf);
+  }
+  
+  /**
+   * Removes everything the test wrote below {@link #testdir}.
+   */
+  @Override
+  protected void tearDown() throws IOException{
+    fs.delete(testdir);
+  }
+
+  /**
+   * Injects one seed list, verifies the crawldb, then injects a second seed
+   * list and verifies that the crawldb holds the urls of both.
+   * 
+   * @throws IOException
+   */
+  public void testInject() throws IOException {
+    ArrayList<String> urls=new ArrayList<String>();
+    for(int i=0;i<100;i++) {
+      urls.add("http://zzz/" + i + ".html");
+    }
+    generateSeedList(urls);
+    
+    Injector injector=new Injector(conf);
+    injector.inject(crawldbPath, urlPath);
+    
+    // verify results
+    assertCrawldbContains(urls);
+    
+    //inject more urls
+    ArrayList<String> urls2=new ArrayList<String>();
+    for(int i=0;i<100;i++) {
+      urls2.add("http://xxx/" + i + ".html");
+    }
+    generateSeedList(urls2);
+    injector.inject(crawldbPath, urlPath);
+    urls.addAll(urls2);
+    
+    // verify results
+    assertCrawldbContains(urls);
+  }
+  
+  /**
+   * Asserts that the crawldb holds as many entries as expected and that the
+   * urls read from it and the expected urls contain each other.
+   * 
+   * @param expected urls that must be present in the crawldb
+   * @throws IOException 
+   */
+  private void assertCrawldbContains(List<String> expected) throws IOException{
+    List<String> read=readCrawldb();
+
+    assertEquals(expected.size(), read.size());
+    
+    assertTrue(read.containsAll(expected));
+    assertTrue(expected.containsAll(read));
+  }
+  
+  /**
+   * Generate seedlist: writes the urls, one per line, to urls.txt below
+   * {@link #urlPath}.
+   * 
+   * @param contents urls to write
+   * @throws IOException 
+   */
+  private void generateSeedList(List<String> contents) throws IOException{
+    Path file=new Path(urlPath,"urls.txt");
+    fs.mkdirs(urlPath);
+    FSDataOutputStream out=fs.create(file);
+    try {
+      for(String url : contents){
+        out.writeBytes(url);
+        out.writeBytes("\n");
+      }
+      out.flush();
+    } finally {
+      // close even when a write fails, so the stream is not leaked
+      out.close();
+    }
+  }
+  
+  /**
+   * Reads the keys of all entries from the data file of the first crawldb
+   * part.
+   * 
+   * @return urls found in the crawldb, in file order
+   * @throws IOException 
+   */
+  private List<String> readCrawldb() throws IOException{
+    Path dbfile=new Path(crawldbPath,CrawlDatum.DB_DIR_NAME + "/part-00000/data");
+    System.out.println("reading:" + dbfile);
+    SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
+    ArrayList<String> read=new ArrayList<String>();
+    try {
+      UTF8 key=new UTF8();
+      CrawlDatum value=new CrawlDatum();
+      while(reader.next(key, value)) {
+        // only the string form of the key is kept, so key and value
+        // instances can be reused between records
+        read.add(key.toString());
+      }
+    } finally {
+      // the reader was never closed before; release the file handle
+      reader.close();
+    }
+
+    return read;
+  }
+
+}