You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/11/13 20:19:16 UTC

svn commit: r1408898 - in /nutch/trunk: ./ src/plugin/ src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/ src/plugin/index-anchor/src/test/ src/plugin/index-anchor/src/test/org/ src/plugin/index-anchor/src/test/org/apache/ src/plugin/ind...

Author: lewismc
Date: Tue Nov 13 19:19:15 2012
New Revision: 1408898

URL: http://svn.apache.org/viewvc?rev=1408898&view=rev
Log:
NUTCH-1117 JUnit test for index-anchor

Added:
    nutch/trunk/src/plugin/index-anchor/src/test/
    nutch/trunk/src/plugin/index-anchor/src/test/org/
    nutch/trunk/src/plugin/index-anchor/src/test/org/apache/
    nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/
    nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/build.xml
    nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1408898&r1=1408897&r2=1408898&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov 13 19:19:15 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1117 JUnit test for index-anchor (lewismc)
+
 * NUTCH-1451 Upgrade automaton jar to 1.11-8 (lewismc)
 
 * NUTCH-1488 bin/nutch to run junit from any directory (snagel via lewismc)

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1408898&r1=1408897&r2=1408898&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Tue Nov 13 19:19:15 2012
@@ -76,6 +76,7 @@
   <target name="test">
     <parallel threadCount="2">
      <ant dir="creativecommons" target="test"/>
+     <ant dir="index-anchor" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="lib-http" target="test"/>

Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1408898&r1=1408897&r2=1408898&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Tue Nov 13 19:19:15 2012
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -30,26 +30,44 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Indexing filter that indexes all inbound anchor text for a document. 
+ * Indexing filter that offers an option to either index all inbound anchor text for 
+ * a document or deduplicate anchors. Deduplication does have its cons;
+ * see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
  */
-public class AnchorIndexingFilter
-  implements IndexingFilter {
+public class AnchorIndexingFilter implements IndexingFilter {
 
   public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class);
   private Configuration conf;
   private boolean deduplicate = false;
 
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration conf) {
     this.conf = conf;
 
     deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
     LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
   }
-
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return this.conf;
   }
 
+  /**
+   * Filters the document through the {@link AnchorIndexingFilter}, which supports
+   * a boolean configuration setting for the deduplication of anchors.
+   * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+   *  
+   * @param doc The {@link NutchDocument} object
+   * @param parse The relevant {@link Parse} object passing through the filter 
+   * @param url URL to be filtered for anchor text
+   * @param datum The {@link CrawlDatum} entry
+   * @param inlinks The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
     Inlinks inlinks) throws IndexingException {
 

Added: nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1408898&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (added)
+++ nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Tue Nov 13 19:19:15 2012
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests
+ * 1. that anchor text is obtained
+ * 2. that anchor deduplication functionality is working
+ * 
+ * @author lewismc
+ *
+ */
+public class TestAnchorIndexingFilter extends TestCase { // Exercises AnchorIndexingFilter with anchor deduplication enabled.
+
+  @Test // NOTE(review): class extends junit.framework.TestCase, so this is presumably discovered by its test* name - confirm @Test is needed
+  public void testDeduplicateAnchor() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("anchorIndexingFilter.deduplicate", true); // turn anchor deduplication on for this run
+    AnchorIndexingFilter filter = new AnchorIndexingFilter();
+    filter.setConf(conf);
+    assertNotNull(filter); // cannot fail: filter was created with 'new' and has already been used above
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://test1.com/", "text1"));
+    inlinks.add(new Inlink("http://test2.com/", "text2"));
+    inlinks.add(new Inlink("http://test3.com/", "text2")); // same anchor text as test2.com, different source URL
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks); // return value ignored; the assertions below inspect 'doc'
+    } catch(Exception e){
+      e.printStackTrace();
+      fail(e.getMessage()); // any exception thrown by the filter is reported as a test failure
+    }
+    assertNotNull(doc); // 'doc' is the local created above and is never reassigned
+    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
+    assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size()); // three inlinks were added, with two distinct anchor texts
+  }
+
+}