You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/07/05 21:25:41 UTC

[4/6] any23 git commit: ANY23-356 Updated xercesImpl, crawler4j

ANY23-356 Updated xercesImpl, crawler4j


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/a5c3f1c4
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/a5c3f1c4
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/a5c3f1c4

Branch: refs/heads/master
Commit: a5c3f1c4a225e8165e97d35cf9eb6502087526e7
Parents: 6b76f34
Author: Hans <fi...@gmail.com>
Authored: Mon Jul 2 09:52:09 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Mon Jul 2 18:24:38 2018 -0500

----------------------------------------------------------------------
 core/src/main/java/org/apache/any23/rdf/RDFUtils.java  |  4 ++--
 plugins/basic-crawler/pom.xml                          |  2 +-
 .../apache/any23/plugin/crawler/DefaultWebCrawler.java | 13 +++++++++----
 plugins/html-scraper/pom.xml                           |  2 +-
 4 files changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
index d323fb3..242984b 100644
--- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
+++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
@@ -528,10 +528,10 @@ public class RDFUtils {
             new java.net.URI(href.trim());
             return true;
         } catch (IllegalArgumentException e) {
-            LOG.debug("Error processing href: {}", href, e);
+            LOG.trace("Error processing href: {}", href, e);
             return false;
         } catch (URISyntaxException e) {
-            LOG.debug("Error interpreting href: {} as URI.", href, e);
+            LOG.trace("Error interpreting href: {} as URI.", href, e);
             return false;
         }
     }

http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/basic-crawler/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/basic-crawler/pom.xml b/plugins/basic-crawler/pom.xml
index 58063ee..b864b92 100644
--- a/plugins/basic-crawler/pom.xml
+++ b/plugins/basic-crawler/pom.xml
@@ -74,7 +74,7 @@
     <dependency>
       <groupId>edu.uci.ics</groupId>
       <artifactId>crawler4j</artifactId>
-      <version>3.4</version>
+      <version>4.4.0</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
----------------------------------------------------------------------
diff --git a/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java b/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
index 2451286..2e43445 100644
--- a/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
+++ b/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
@@ -47,12 +47,17 @@ public class DefaultWebCrawler extends WebCrawler {
     /**
      * Override this method to specify whether the given URL should be visited or not.
      */
+
     @Override
-    public boolean shouldVisit(WebURL url) {
-        if (url.getURL() == null) return false;
+    public boolean shouldVisit(Page referringPage, WebURL url) {
+        if (!super.shouldVisit(referringPage, url))
+            return false;
+        if (url.getURL() == null)
+            return false;
         final String href = url.getURL().toLowerCase();
-        if( ! href.startsWith( sharedData.getSeed() ) ) return false;
-        return pattern == null || ! pattern.matcher(href).matches();
+        if (!href.startsWith(sharedData.getSeed()))
+            return false;
+        return pattern == null || !pattern.matcher(href).matches();
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/html-scraper/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/html-scraper/pom.xml b/plugins/html-scraper/pom.xml
index 5f47adb..e24f6b6 100644
--- a/plugins/html-scraper/pom.xml
+++ b/plugins/html-scraper/pom.xml
@@ -56,7 +56,7 @@
     <dependency>
       <groupId>xerces</groupId>
       <artifactId>xercesImpl</artifactId>
-      <version>2.9.1</version>
+      <version>2.12.0</version>
       <scope>provided</scope>
       <exclusions>
         <exclusion>