You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/05/15 10:10:09 UTC

svn commit: r1594812 [5/5] - in /nutch/branches/2.x: ./ ivy/ src/gora/ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/host/ src/java/org/apache/nutch/indexer/ src/java/org/ap...

Modified: nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu May 15 08:10:07 2014
@@ -17,17 +17,7 @@
 
 package org.apache.nutch.scoring.opic;
 
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
 import org.apache.avro.util.Utf8;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.scoring.ScoreDatum;
@@ -35,6 +25,16 @@ import org.apache.nutch.scoring.ScoringF
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 /**
  * This plugin implements a variant of an Online Page Importance Computation
@@ -82,7 +82,7 @@ public class OPICScoringFilter implement
   public void injectedScore(String url, WebPage row)
   throws ScoringFilterException {
     float score = row.getScore();
-    row.putToMetadata(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(score)));
+    row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(score)));
   }
 
   /** Set to 0.0f (unknown value) - inlink contributions will bring it to
@@ -90,7 +90,7 @@ public class OPICScoringFilter implement
   @Override
   public void initialScore(String url, WebPage row) throws ScoringFilterException {
     row.setScore(0.0f);
-    row.putToMetadata(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f)));
+    row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f)));
   }
 
   /** Use {@link WebPage#getScore()}. */
@@ -108,12 +108,12 @@ public class OPICScoringFilter implement
     }
     float oldScore = row.getScore();
     row.setScore(oldScore + adjust);
-    ByteBuffer cashRaw = row.getFromMetadata(CASH_KEY);
+    ByteBuffer cashRaw = row.getMetadata().get(CASH_KEY);
     float cash = 0.0f;
     if (cashRaw != null) {
       cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position());
     }
-    row.putToMetadata(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(cash + adjust)));
+    row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(cash + adjust)));
   }
 
   /** Get cash on hand, divide it by the number of outlinks and apply. */
@@ -121,7 +121,7 @@ public class OPICScoringFilter implement
   public void distributeScoreToOutlinks(String fromUrl,
       WebPage row, Collection<ScoreDatum> scoreData,
       int allCount) {
-    ByteBuffer cashRaw = row.getFromMetadata(CASH_KEY);
+    ByteBuffer cashRaw = row.getMetadata().get(CASH_KEY);
     if (cashRaw == null) {
       return;
     }
@@ -149,7 +149,7 @@ public class OPICScoringFilter implement
       }
     }
     // reset cash to zero
-    row.putToMetadata(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f)));
+    row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f)));
   }
 
   /** Dampen the boost value by scorePower.*/

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java Thu May 15 08:10:07 2014
@@ -87,7 +87,7 @@ public class TestAdaptiveFetchSchedule e
     wp.setStatus(1);
     wp.setFetchInterval(interval);
     wp.setScore(1.0f);
-    wp.setFetchTime(0);
+    wp.setFetchTime(0L);
     return wp;
   }
 

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Thu May 15 08:10:07 2014
@@ -16,13 +16,6 @@
  */
 package org.apache.nutch.crawl;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
@@ -33,7 +26,15 @@ import org.junit.After;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
-import static org.junit.Assert.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import static org.junit.Assert.assertEquals;
 
 /**
  * Basic generator test. 1. Insert entries in webtable 2. Generates entries to
@@ -294,10 +295,10 @@ public class TestGenerator extends Abstr
    */
   private URLWebPage createURLWebPage(final String url,
       final int fetchInterval, final float score) {
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     page.setFetchInterval(fetchInterval);
     page.setScore(score);
-    page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+    page.setStatus((int)CrawlStatus.STATUS_UNFETCHED);
     return new URLWebPage(url, page);
   }
 

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Thu May 15 08:10:07 2014
@@ -16,11 +16,6 @@
  */
 package org.apache.nutch.crawl;
 
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.fs.Path;
 import org.apache.nutch.storage.WebPage;
@@ -30,7 +25,14 @@ import org.apache.nutch.util.CrawlTestUt
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
-import static org.junit.Assert.*;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 /**
  * Basic injector test: 1. Creates a text file with urls 2. Injects them into
@@ -110,8 +112,8 @@ public class TestInjector extends Abstra
     for (URLWebPage up : pages) {
       WebPage page = up.getDatum();
       String representation = up.getUrl();
-      representation += "\tnutch.score=" + (int)page.getScore();
-      ByteBuffer bb = page.getFromMetadata(new Utf8("custom.attribute"));
+      representation += "\tnutch.score=" + page.getScore().intValue();
+      ByteBuffer bb = page.getMetadata().get(new Utf8("custom.attribute"));
       if (bb != null) {
         representation += "\tcustom.attribute=" + Bytes.toString(bb);
       }

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java Thu May 15 08:10:07 2014
@@ -16,11 +16,6 @@
  ******************************************************************************/
 package org.apache.nutch.crawl;
 
-import java.net.MalformedURLException;
-
-import org.junit.Test;
-import static org.junit.Assert.*;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -30,6 +25,12 @@ import org.apache.nutch.fetcher.FetchEnt
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.TableUtil;
+import org.junit.Test;
+
+import java.net.MalformedURLException;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
 
 /**
  * Tests {@link URLPartitioner}
@@ -168,7 +169,7 @@ public class TestURLPartitioner {
     int partitionFromRef = refPartitioner.getPartition("http://www.example.org/", numReduceTasks);
     //init selector entry (score shouldn't matter)
     SelectorEntry selectorEntry = new SelectorEntry("http://www.example.org/", 1337);
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     int partitionFromSig = sigPartitioner.getPartition(selectorEntry, page, numReduceTasks);
     
     assertEquals("partitions should be same", 
@@ -199,7 +200,7 @@ public class TestURLPartitioner {
     
     int partitionFromRef = refPartitioner.getPartition("http://www.example.org/", numReduceTasks);
     IntWritable intWritable = new IntWritable(1337); //doesn't matter
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     String key = TableUtil.reverseUrl("http://www.example.org/");
     FetchEntry fetchEntry = new FetchEntry(conf, key, page);
     int partitionFromSig = sigPartitioner.getPartition(intWritable, fetchEntry, numReduceTasks);

Modified: nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java Thu May 15 08:10:07 2014
@@ -16,14 +16,15 @@
  */
 package org.apache.nutch.indexer;
 
-import org.junit.Test;
-import static org.junit.Assert.*;
-
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
 
 public class TestIndexingFilters {
 
@@ -42,7 +43,7 @@ public class TestIndexingFilters {
     conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
 
     IndexingFilters filters = new IndexingFilters(conf);
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     page.setText(new Utf8("text"));
     page.setTitle(new Utf8("title"));
     filters.filter(new NutchDocument(),"http://www.example.com/",page);
@@ -59,7 +60,7 @@ public class TestIndexingFilters {
     conf.addResource("crawl-tests.xml");
 
     IndexingFilters filters = new IndexingFilters(conf);
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     page.setText(new Utf8("text"));
     page.setTitle(new Utf8("title"));
     NutchDocument doc = filters.filter(null,"http://www.example.com/",page);
@@ -82,7 +83,7 @@ public class TestIndexingFilters {
     conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);
 
     IndexingFilters filters1 = new IndexingFilters(conf);
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     page.setText(new Utf8("text"));
     page.setTitle(new Utf8("title"));
     NutchDocument fdoc1 = filters1.filter(new NutchDocument(),"http://www.example.com/",page);

Modified: nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java Thu May 15 08:10:07 2014
@@ -16,16 +16,6 @@
  ******************************************************************************/
 package org.apache.nutch.storage;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
 import org.apache.avro.util.Utf8;
 import org.apache.commons.io.IOUtils;
 import org.apache.gora.query.Result;
@@ -34,12 +24,23 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.nutch.util.AbstractNutchTest;
 import org.apache.nutch.util.CrawlTestUtil;
 import org.hsqldb.Server;
-import org.junit.Ignore;
-
 import org.junit.After;
 import org.junit.Before;
+import org.junit.Ignore;
 import org.junit.Test;
-import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 
 /**
  * Tests basic Gora functionality by writing and reading webpages.
@@ -71,7 +72,7 @@ public class TestGoraStorage extends Abs
 
   private static void readWrite(String id, DataStore<String, WebPage> store) 
       throws IOException, Exception {
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     int max = 1000;
     for (int i = 0; i < max; i++) {
       // store a page with title

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java Thu May 15 08:10:07 2014
@@ -16,14 +16,9 @@
  */
 package org.apache.nutch.util;
 
-import java.io.IOException;
-import java.net.UnknownHostException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.gora.query.Query;
+import org.apache.gora.query.Result;
+import org.apache.gora.store.DataStore;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -31,14 +26,19 @@ import org.apache.hadoop.fs.Path;
 import org.apache.nutch.crawl.URLWebPage;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
-import org.apache.gora.query.Query;
-import org.apache.gora.query.Result;
-import org.apache.gora.store.DataStore;
 import org.mortbay.jetty.Handler;
 import org.mortbay.jetty.Server;
 import org.mortbay.jetty.handler.DefaultHandler;
 import org.mortbay.jetty.handler.HandlerList;
 import org.mortbay.jetty.handler.ResourceHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
 
 public class CrawlTestUtil {
 
@@ -121,7 +121,7 @@ public class CrawlTestUtil {
         if (requiredMark != null && requiredMark.checkMark(page) == null)
           continue;
 
-        l.add(new URLWebPage(TableUtil.unreverseUrl(url), (WebPage)page.clone()));
+        l.add(new URLWebPage(TableUtil.unreverseUrl(url), WebPage.newBuilder(page).build()));
       } catch (Exception e) {
         e.printStackTrace();
       }

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java Thu May 15 08:10:07 2014
@@ -16,16 +16,16 @@
  */
 package org.apache.nutch.util;
 
-import java.io.UnsupportedEncodingException;
-import java.nio.ByteBuffer;
-
-import org.junit.Test;
-import static org.junit.Assert.*;
-
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.storage.WebPage;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+
+import static org.junit.Assert.assertEquals;
 
 public class TestEncodingDetector {
   private static Configuration conf = NutchConfiguration.create();
@@ -50,7 +50,7 @@ public class TestEncodingDetector {
     // Content content;
     String encoding;
 
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8("http://www.example.com/"));
     page.setContentType(new Utf8("text/plain"));
     page.setContent(ByteBuffer.wrap(contentInOctets));
@@ -61,18 +61,18 @@ public class TestEncodingDetector {
     // no information is available, so it should return default encoding
     assertEquals("windows-1252", encoding.toLowerCase());
 
-    page = new WebPage();
+    page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8("http://www.example.com/"));
     page.setContentType(new Utf8("text/plain"));
     page.setContent(ByteBuffer.wrap(contentInOctets));
-    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));
+    page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));
     
     detector = new EncodingDetector(conf);
     detector.autoDetectClues(page, true);
     encoding = detector.guessEncoding(page, "windows-1252");
     assertEquals("utf-16", encoding.toLowerCase());
 
-    page = new WebPage();
+    page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8("http://www.example.com/"));
     page.setContentType(new Utf8("text/plain"));
     page.setContent(ByteBuffer.wrap(contentInOctets));
@@ -85,11 +85,11 @@ public class TestEncodingDetector {
 
     // enable autodetection
     conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
-    page = new WebPage();
+    page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8("http://www.example.com/"));
     page.setContentType(new Utf8("text/plain"));
     page.setContent(ByteBuffer.wrap(contentInOctets));
-    page.putToMetadata(new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));
+    page.getMetadata().put(new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));
     
     detector = new EncodingDetector(conf);
     detector.autoDetectClues(page, true);