You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/07/05 21:25:41 UTC
[4/6] any23 git commit: ANY23-356 Updated xercesImpl, crawler4j
ANY23-356 Updated xercesImpl, crawler4j
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/a5c3f1c4
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/a5c3f1c4
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/a5c3f1c4
Branch: refs/heads/master
Commit: a5c3f1c4a225e8165e97d35cf9eb6502087526e7
Parents: 6b76f34
Author: Hans <fi...@gmail.com>
Authored: Mon Jul 2 09:52:09 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Mon Jul 2 18:24:38 2018 -0500
----------------------------------------------------------------------
core/src/main/java/org/apache/any23/rdf/RDFUtils.java | 4 ++--
plugins/basic-crawler/pom.xml | 2 +-
.../apache/any23/plugin/crawler/DefaultWebCrawler.java | 13 +++++++++----
plugins/html-scraper/pom.xml | 2 +-
4 files changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
index d323fb3..242984b 100644
--- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
+++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
@@ -528,10 +528,10 @@ public class RDFUtils {
new java.net.URI(href.trim());
return true;
} catch (IllegalArgumentException e) {
- LOG.debug("Error processing href: {}", href, e);
+ LOG.trace("Error processing href: {}", href, e);
return false;
} catch (URISyntaxException e) {
- LOG.debug("Error interpreting href: {} as URI.", href, e);
+ LOG.trace("Error interpreting href: {} as URI.", href, e);
return false;
}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/basic-crawler/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/basic-crawler/pom.xml b/plugins/basic-crawler/pom.xml
index 58063ee..b864b92 100644
--- a/plugins/basic-crawler/pom.xml
+++ b/plugins/basic-crawler/pom.xml
@@ -74,7 +74,7 @@
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
- <version>3.4</version>
+ <version>4.4.0</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
----------------------------------------------------------------------
diff --git a/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java b/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
index 2451286..2e43445 100644
--- a/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
+++ b/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java
@@ -47,12 +47,17 @@ public class DefaultWebCrawler extends WebCrawler {
/**
* Override this method to specify whether the given URL should be visited or not.
*/
+
@Override
- public boolean shouldVisit(WebURL url) {
- if (url.getURL() == null) return false;
+ public boolean shouldVisit(Page referringPage, WebURL url) {
+ if (!super.shouldVisit(referringPage, url))
+ return false;
+ if (url.getURL() == null)
+ return false;
final String href = url.getURL().toLowerCase();
- if( ! href.startsWith( sharedData.getSeed() ) ) return false;
- return pattern == null || ! pattern.matcher(href).matches();
+ if (!href.startsWith(sharedData.getSeed()))
+ return false;
+ return pattern == null || !pattern.matcher(href).matches();
}
/**
http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/html-scraper/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/html-scraper/pom.xml b/plugins/html-scraper/pom.xml
index 5f47adb..e24f6b6 100644
--- a/plugins/html-scraper/pom.xml
+++ b/plugins/html-scraper/pom.xml
@@ -56,7 +56,7 @@
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
- <version>2.9.1</version>
+ <version>2.12.0</version>
<scope>provided</scope>
<exclusions>
<exclusion>