You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/08/06 15:57:42 UTC

svn commit: r1369847 - in /nutch/branches/2.x: ./ src/plugin/ src/plugin/index-anchor/ src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/ src/plugin/index-anchor/src/test/ src/plugin/index-anchor/src/test/org/ src/plugin/index-anchor/src...

Author: lewismc
Date: Mon Aug  6 13:57:41 2012
New Revision: 1369847

URL: http://svn.apache.org/viewvc?rev=1369847&view=rev
Log:
NUTCH-1159 Write JUnit test for index-anchor

Added:
    nutch/branches/2.x/src/plugin/index-anchor/src/test/
    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/
    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/
    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/
    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/
    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/
    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/build.xml
    nutch/branches/2.x/src/plugin/index-anchor/build.xml
    nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1369847&r1=1369846&r2=1369847&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Aug  6 13:57:41 2012
@@ -1,6 +1,9 @@
 Nutch Change Log
 
 Release 2.1 - Current Development
+
+* NUTCH-1159 Write JUnit test for index-anchor (ferdy + lewismc)
+
 * NUTCH-1445 Add ElasticIndexerJob that indexes to elasticsearch (ferdy)
 
 * NUTCH-1444 Indexing should not create temporary files (do not extend from FileOutputFormat) (ferdy)

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1369847&r1=1369846&r2=1369847&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Mon Aug  6 13:57:41 2012
@@ -74,6 +74,7 @@
      <ant dir="parse-tika" target="test"/>
      <ant dir="protocol-file" target="test"/>
      <ant dir="parse-html" target="test"/>
+  	 <ant dir="index-anchor" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>

Modified: nutch/branches/2.x/src/plugin/index-anchor/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/build.xml?rev=1369847&r1=1369846&r2=1369847&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/build.xml (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/build.xml Mon Aug  6 13:57:41 2012
@@ -19,4 +19,4 @@
 
   <import file="../build-plugin.xml" />
 
-</project>
\ No newline at end of file
+</project>

Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1369847&r1=1369846&r2=1369847&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Mon Aug  6 13:57:41 2012
@@ -31,7 +31,9 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Indexing filter that indexes all inbound anchor text for a document.
+ * Indexing filter that offers an option to either index all inbound anchor text for 
+ * a document or deduplicate anchors. Deduplication does have its cons; 
+ * see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
  */
 public class AnchorIndexingFilter implements IndexingFilter {
 
@@ -44,21 +46,37 @@ public class AnchorIndexingFilter implem
   static {
     FIELDS.add(WebPage.Field.INLINKS);
   }
-
+  
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration conf) {
     this.conf = conf;
 
     deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
     LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
   }
-
+  
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return this.conf;
   }
-
+  
   public void addIndexBackendOptions(Configuration conf) {
   }
-
+  
+  /**
+   * Adds the inbound anchor text of a page to the document being indexed. 
+   * Whether duplicate anchors are dropped is controlled by a boolean setting; 
+   * see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+   *  
+   * @param doc The {@link NutchDocument} object
+   * @param url URL to be filtered for anchor text
+   * @param page {@link WebPage} object relative to the URL
+   * @return filtered NutchDocument
+   */
   @Override
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
@@ -85,7 +103,10 @@ public class AnchorIndexingFilter implem
     
     return doc;
   }
-
+  
+  /**
+   * Gets the {@link WebPage} fields held in FIELDS, which include the inlinks
+   */
   @Override
   public Collection<WebPage.Field> getFields() {
     return FIELDS;

Added: nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1369847&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (added)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Mon Aug  6 13:57:41 2012
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.util.Collection;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * JUnit test case for {@link AnchorIndexingFilter}. With deduplication enabled it checks
+ * 1. that inbound anchor text is added to the document as an "anchor" field
+ * 2. that inlinks sharing the same anchor text contribute only one value
+ * 
+ * @author lewismc
+ *
+ */
+public class TestAnchorIndexingFilter extends TestCase {
+  // Class logger; nothing in the test below writes to it.
+  public static final Logger LOG = LoggerFactory.getLogger(TestAnchorIndexingFilter.class);
+  /** Runs the filter with deduplication on over three inlinks, two sharing one anchor text. */
+  @Test
+  public void testDeduplicateAnchor() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
+    AnchorIndexingFilter filter = new AnchorIndexingFilter();
+    filter.setConf(conf);
+    NutchDocument doc = new NutchDocument();
+    WebPage page = new WebPage();
+    page.putToInlinks(new Utf8("http://example1.com/"), new Utf8("cool site"));
+    page.putToInlinks(new Utf8("http://example2.com/"), new Utf8("cool site")); // same text as example1
+    page.putToInlinks(new Utf8("http://example3.com/"), new Utf8("fun site"));
+    filter.filter(doc, "http://myurldoesnotmatter.com/", page);
+    
+    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
+    // "cool site" is on two inlinks but is expected once, plus "fun site": 2 values in total.
+    assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size());
+  }
+
+}