You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@nutch.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2017/11/05 21:04:00 UTC

[jira] [Commented] (NUTCH-2443) Extract links from the video tag with the parse-html plugin

    [ https://issues.apache.org/jira/browse/NUTCH-2443?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16239723#comment-16239723 ] 

ASF GitHub Bot commented on NUTCH-2443:
---------------------------------------

sebastian-nagel closed pull request #230: NUTCH-2443 add source tag to the parse-html/tika outlink extractor
URL: https://github.com/apache/nutch/pull/230
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 909da7ef4..4527dd7b4 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -86,6 +86,7 @@ public void setConf(Configuration conf) {
     linkParams.put("script", new LinkParams("script", "src", 0));
     linkParams.put("link", new LinkParams("link", "href", 0));
     linkParams.put("img", new LinkParams("img", "src", 0));
+    linkParams.put("source", new LinkParams("source", "src", 0));
 
     // remove unwanted link tags from the linkParams map
     String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 15725aee6..0faa013e9 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -127,7 +127,11 @@
           + "<a href=\"g\"><!--no anchor--></a>"
           + "<a href=\"g1\"> <!--whitespace-->  </a>"
           + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
-          + "</body></html>"), };
+          + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<video width=\"320\" height=\"240\" controls> "
+          + "<source src=\"movie.mp4\" type=\"video/mp4\">"
+          + "</video>" + "</body></html>"), };
 
   private static int SKIP = 9;
 
@@ -137,7 +141,8 @@
       "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
       "http://www.nutch.org//", "http://www.nutch.org/",
       "http://www.nutch.org/", "http://www.nutch.org/",
-      "http://www.nutch.org/;something", "http://www.nutch.org/" };
+      "http://www.nutch.org/;something", "http://www.nutch.org/",
+      "http://www.nutch.org/" };
 
   private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
 
@@ -157,11 +162,11 @@
           + "one two two three three four put some text here and there. "
           + "End this madness ! . . . .", "ignore ignore", "test1 test2",
       "test1 test2", "title anchor1 anchor2 anchor3",
-      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
+      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title", "" };
 
   private static final String[] answerTitle = { "title", "title", "",
       "my title", "my title", "my title", "my title", "", "", "", "title",
-      "title", "title" };
+      "title", "title", "" };
 
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
@@ -231,7 +236,8 @@ public void setup() {
           { new Outlink("http://www.nutch.org/g", ""),
               new Outlink("http://www.nutch.org/g1", ""),
               new Outlink("http://www.nutch.org/g2", "bla bla"),
-              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
+              new Outlink("http://www.nutch.org/test.gif", "bla bla"), },
+          { new Outlink("http://www.nutch.org/movie.mp4", "") } };
 
     } catch (MalformedURLException e) {
 
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index e5dbd16a9..af85480bc 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -90,6 +90,7 @@ public void setConf(Configuration conf) {
     linkParams.put("script", new LinkParams("script", "src", 0));
     linkParams.put("link", new LinkParams("link", "href", 0));
     linkParams.put("img", new LinkParams("img", "src", 0));
+    linkParams.put("source", new LinkParams("source", "src", 0));
 
     // remove unwanted link tags from the linkParams map
     String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
index 96029a6b4..2159b9d5a 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
@@ -130,7 +130,11 @@
       new String("<html><head><title> title </title>" + "</head><body>"
           + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
           + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
-          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<video width=\"320\" height=\"240\" controls> "
+          + "<source src=\"movie.mp4\" type=\"video/mp4\">"
+          + "</video>" + "</body></html>"), };
 
   private static int SKIP = 9;
 
@@ -140,7 +144,7 @@
       "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
       "http://www.nutch.org//", "http://www.nutch.org/",
       "http://www.nutch.org/", "http://www.nutch.org/",
-      "http://www.nutch.org/;something" };
+      "http://www.nutch.org/;something", "http://www.nutch.org/" };
 
   private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
 
@@ -160,11 +164,11 @@
           + "one two two three three four put some text here and there. "
           + "End this madness ! . . . .", "ignore ignore", "test1 test2",
       "test1 test2", "title anchor1 anchor2 anchor3",
-      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+      "title anchor1 anchor2 anchor3 anchor4 anchor5", "" };
 
   private static final String[] answerTitle = { "title", "title", "",
       "my title", "my title", "my title", "my title", "", "", "", "title",
-      "title" };
+      "title", "" };
 
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
@@ -225,7 +229,8 @@ public void setup() throws Exception {
             new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
             new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
             new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
-                "anchor5") } };
+                "anchor5") },
+        { new Outlink("http://www.nutch.org/movie.mp4", "") } };
 
   }
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Extract links from the video tag with the parse-html plugin
> -----------------------------------------------------------
>
>                 Key: NUTCH-2443
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2443
>             Project: Nutch
>          Issue Type: Improvement
>          Components: parser, plugin
>    Affects Versions: 1.13
>            Reporter: Jorge Luis Betancourt Gonzalez
>            Assignee: Jorge Luis Betancourt Gonzalez
>            Priority: Minor
>             Fix For: 1.14
>
>
> At the moment the {{parse-html}} plugin extracts links from the tags {{a}}, {{area}}, {{form}} (configurable), {{frame}}, {{iframe}}, {{script}}, {{link}} and {{img}}. Since we already allow extracting links to binary files (images), extracting links from the {{video}} tag should also be supported.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)