You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/01 13:11:28 UTC

[1/2] nutch git commit: NUTCH-2291 - Fix mrunit dependencies - remove classifier from dependency because pom file name on Maven repository does not contain a classifier

Repository: nutch
Updated Branches:
  refs/heads/master cb6fbae51 -> 34050adae


NUTCH-2291 - Fix mrunit dependencies
- remove classifier from dependency because pom file name on Maven repository does not contain a classifier


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/c18e19bf
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/c18e19bf
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/c18e19bf

Branch: refs/heads/master
Commit: c18e19bfe63c3ac5221d1a0f454b9e1a037a4386
Parents: cb6fbae
Author: Sebastian Nagel <sn...@apache.org>
Authored: Fri Jul 1 14:45:41 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Fri Jul 1 14:45:41 2016 +0200

----------------------------------------------------------------------
 ivy/ivy.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/c18e19bf/ivy/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index a4e9481..a9a83ae 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -97,7 +97,7 @@
 
 		<!--artifacts needed for testing -->
 		<dependency org="junit" name="junit" rev="4.11" conf="test->default" />
-		<dependency org="org.apache.mrunit" name="mrunit" rev="1.1.0" conf="test->default" maven:classifier="hadoop2">
+		<dependency org="org.apache.mrunit" name="mrunit" rev="1.1.0" conf="test->default">
 			<artifact name="mrunit" maven:classifier="hadoop2" />
 			<exclude org="log4j" module="log4j" />
 		</dependency>


[2/2] nutch git commit: NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html - fix broken unit test (fix HTML markup, make test for meta data extraction obligatory) - add all values of general metadata to parse metadata

Posted by sn...@apache.org.
NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html
- fix broken unit test (fix HTML markup, make test for meta data extraction obligatory)
- add all values of general metadata to parse metadata


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada

Branch: refs/heads/master
Commit: 34050adae0896a6d7ddb254a1622a03af6e07175
Parents: c18e19b
Author: Sebastian Nagel <sn...@apache.org>
Authored: Fri Jul 1 15:07:52 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Fri Jul 1 15:10:49 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/metadata/Metadata.java     | 25 ++++++++++++++++++++
 .../org/apache/nutch/parse/html/HtmlParser.java |  4 +---
 .../apache/nutch/parse/html/TestHtmlParser.java | 11 ++++-----
 3 files changed, 31 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java
index f0bfcd3..8a57ee3 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -123,6 +123,31 @@ public class Metadata implements Writable, CreativeCommons, DublinCore,
   }
 
   /**
+   * Add all name/value mappings (merge two metadata mappings). If a name
+   * already exists in current metadata the values are added to existing values.
+   *
+   * @param metadata
+   *          other Metadata to be merged
+   */
+  public void addAll(Metadata metadata) {
+    for (String name : metadata.names()) {
+      String[] addValues = metadata.getValues(name);
+      if (addValues == null)
+        continue;
+      String[] oldValues = this.metadata.get(name);
+      if (oldValues == null) {
+        this.metadata.put(name, addValues);
+      } else {
+        String[] newValues = new String[oldValues.length + addValues.length];
+        System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+        System.arraycopy(addValues, 0, newValues, oldValues.length,
+            addValues.length);
+        this.metadata.put(name, newValues);
+      }
+    }
+  }
+
+  /**
    * Copy All key-value pairs from properties.
    * 
    * @param properties

http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index b6666aa..4d043ba 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -183,9 +183,7 @@ public class HtmlParser implements Parser {
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
 
     // populate Nutch metadata with HTML meta directives
-    for (String name : metaTags.getGeneralTags().names()) {
-      metadata.add(name, metaTags.getGeneralTags().get(name));
-    }
+    metadata.addAll(metaTags.getGeneralTags());
 
     if (LOG.isTraceEnabled()) {
       LOG.trace("Meta tags for " + base + ": " + metaTags.toString());

http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index bcfe9e4..7099f50 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -40,8 +40,8 @@ public class TestHtmlParser {
   private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n  <li>\u010de\u0161tina\n  <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
   private static final String encodingTestContent = "<title>"
       + encodingTestKeywords + "</title>\n"
-      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
-      + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
 
   private static String[][] encodingTestPages = {
       {
@@ -113,10 +113,9 @@ public class TestHtmlParser {
         Assert.assertTrue(keyword + " not found in text (" + name + ")",
             text.contains(keyword));
       }
-      if (keywords != null) {
-        Assert.assertEquals("Keywords not extracted properly (" + name + ")",
-            encodingTestKeywords, keywords);
-      }
+      Assert.assertNotNull("No keywords extracted", keywords);
+      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+          encodingTestKeywords, keywords);
     }
   }