You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/12/04 20:09:15 UTC
svn commit: r723398 -
/lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
Author: kubes
Date: Thu Dec 4 11:09:15 2008
New Revision: 723398
URL: http://svn.apache.org/viewvc?rev=723398&view=rev
Log:
NUTCH-647: Resolve URLs tool. New methods for URLUtil should have been included but weren't.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=723398&r1=723397&r2=723398&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Dec 4 11:09:15 2008
@@ -194,6 +194,42 @@
return temp ? src : dst;
}
+ /**
+  * Returns the lowercased hostname for the url or null if the url is not well
+  * formed.
+  *
+  * The hostname is lowercased with a fixed locale, so the result does not
+  * depend on the default locale of the JVM.
+  *
+  * @param url The url to check.
+  * @return String The lowercased hostname for the url, or null if the url
+  * cannot be parsed.
+  */
+ public static String getHost(String url) {
+   try {
+     // Use an explicit locale: the default-locale toLowerCase() turns 'I' into
+     // a dotless 'i' under e.g. a Turkish locale, corrupting ASCII hostnames.
+     return new URL(url).getHost().toLowerCase(java.util.Locale.ENGLISH);
+   }
+   catch (MalformedURLException e) {
+     return null;
+   }
+ }
+
+ /**
+  * Returns the page for the url. The page consists of the protocol, host,
+  * and path, but does not include the query string. The protocol and host
+  * are lowercased but the rest of the url (user info, port, path and any
+  * fragment) is returned exactly as it was given.
+  *
+  * @param url The url to check.
+  * @return String The page for the url, or null if the url is not well
+  * formed.
+  */
+ public static String getPage(String url) {
+   try {
+     URL parsed = new URL(url);
+     String page = url;
+
+     // Cut the query string (and its leading '?') out of the url. Only the
+     // first occurrence is removed so that identical text appearing later,
+     // e.g. inside a fragment, is left alone.
+     String query = parsed.getQuery();
+     if (query != null) {
+       String marker = "?" + query;
+       int queryStart = page.indexOf(marker);
+       if (queryStart >= 0) {
+         page = page.substring(0, queryStart)
+           + page.substring(queryStart + marker.length());
+       }
+     }
+
+     // The protocol is case-insensitive; normalize it to lowercase.
+     int colon = page.indexOf(':');
+     if (colon > 0) {
+       page = page.substring(0, colon).toLowerCase(java.util.Locale.ENGLISH)
+         + page.substring(colon);
+     }
+
+     // Lowercase only the host. The authority may also hold user info and a
+     // port, so locate the host inside it (after any '@') instead of
+     // lowercasing the whole authority or the whole url.
+     String host = parsed.getHost();
+     String authority = parsed.getAuthority();
+     if (host != null && host.length() > 0 && authority != null) {
+       int authorityStart = page.indexOf("//" + authority);
+       if (authorityStart >= 0) {
+         int hostStart = authorityStart + 2 + authority.lastIndexOf('@') + 1;
+         if (page.startsWith(host, hostStart)) {
+           page = page.substring(0, hostStart)
+             + host.toLowerCase(java.util.Locale.ENGLISH)
+             + page.substring(hostStart + host.length());
+         }
+       }
+     }
+     return page;
+   }
+   catch (MalformedURLException e) {
+     return null;
+   }
+ }
+
/** For testing */
public static void main(String[] args){