You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/01 13:11:29 UTC
[2/2] nutch git commit: NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html
- fix broken unit test (fix HTML markup, make test for meta data extraction obligatory)
- add all values of general metadata to parse metadata
NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html
- fix broken unit test (fix HTML markup, make test for meta data extraction obligatory)
- add all values of general metadata to parse metadata
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada
Branch: refs/heads/master
Commit: 34050adae0896a6d7ddb254a1622a03af6e07175
Parents: c18e19b
Author: Sebastian Nagel <sn...@apache.org>
Authored: Fri Jul 1 15:07:52 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Fri Jul 1 15:10:49 2016 +0200
----------------------------------------------------------------------
.../org/apache/nutch/metadata/Metadata.java | 25 ++++++++++++++++++++
.../org/apache/nutch/parse/html/HtmlParser.java | 4 +---
.../apache/nutch/parse/html/TestHtmlParser.java | 11 ++++-----
3 files changed, 31 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java
index f0bfcd3..8a57ee3 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -123,6 +123,31 @@ public class Metadata implements Writable, CreativeCommons, DublinCore,
}
/**
+ * Add all name/value mappings (merge two metadata mappings). If a name
+ * already exists in current metadata the values are added to existing values.
+ *
+ * @param metadata
+ * other Metadata to be merged
+ */
+ public void addAll(Metadata metadata) {
+ for (String name : metadata.names()) {
+ String[] addValues = metadata.getValues(name);
+ if (addValues == null)
+ continue;
+ String[] oldValues = this.metadata.get(name);
+ if (oldValues == null) {
+ this.metadata.put(name, addValues);
+ } else {
+ String[] newValues = new String[oldValues.length + addValues.length];
+ System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+ System.arraycopy(addValues, 0, newValues, oldValues.length,
+ addValues.length);
+ this.metadata.put(name, newValues);
+ }
+ }
+ }
+
+ /**
* Copy All key-value pairs from properties.
*
* @param properties
http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index b6666aa..4d043ba 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -183,9 +183,7 @@ public class HtmlParser implements Parser {
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
// populate Nutch metadata with HTML meta directives
- for (String name : metaTags.getGeneralTags().names()) {
- metadata.add(name, metaTags.getGeneralTags().get(name));
- }
+ metadata.addAll(metaTags.getGeneralTags());
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index bcfe9e4..7099f50 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -40,8 +40,8 @@ public class TestHtmlParser {
private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n <li>\u010de\u0161tina\n <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
private static final String encodingTestContent = "<title>"
+ encodingTestKeywords + "</title>\n"
- + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
- + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+ + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+ + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
private static String[][] encodingTestPages = {
{
@@ -113,10 +113,9 @@ public class TestHtmlParser {
Assert.assertTrue(keyword + " not found in text (" + name + ")",
text.contains(keyword));
}
- if (keywords != null) {
- Assert.assertEquals("Keywords not extracted properly (" + name + ")",
- encodingTestKeywords, keywords);
- }
+ Assert.assertNotNull("No keywords extracted", keywords);
+ Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+ encodingTestKeywords, keywords);
}
}