You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by gr...@apache.org on 2005/05/23 02:34:23 UTC
svn commit: r176412 -
/lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
Author: gregor
Date: Sun May 22 17:34:22 2005
New Revision: 176412
URL: http://svn.apache.org/viewcvs?rev=176412&view=rev
Log:
Simplified parsing code
Modified:
lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
Modified: lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java?rev=176412&r1=176411&r2=176412&view=diff
==============================================================================
--- lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java (original)
+++ lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java Sun May 22 17:34:22 2005
@@ -265,18 +265,15 @@
/**
* Parse a URL
* @param urlString URL to parse
- * @return ok if the parse succeeded, or an error message if it did not
- * FIXME why does this return null?
+ * @return a list of URL
*/
public List parsePage(String urlString) {
- String status = "ok";
-
- try {
- URL currentURL = new java.net.URL(urlString);
- HttpURLConnection httpCon = (HttpURLConnection) currentURL.openConnection();
+ HttpURLConnection httpCon = null;
+ try {
+ URL currentURL = new URL(urlString);
+ httpCon = (HttpURLConnection) currentURL.openConnection();
httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler");
-
httpCon.connect();
if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
@@ -286,25 +283,14 @@
return handleHTML(httpCon);
} else if (contentType.indexOf("application/pdf") != -1) {
handlePDF(httpCon);
- } else {
- status = "Not an excepted content type : " + contentType;
}
- } else {
- status = "bad";
}
+ } catch (final Exception e) {
+ log.error(e.getMessage(), e);
+ } finally {
httpCon.disconnect();
- } catch (java.net.MalformedURLException mue) {
- status = mue.toString();
- } catch (java.net.UnknownHostException uh) {
- status = uh.toString(); // Mark as a bad URL
- } catch (java.io.IOException ioe) {
- status = ioe.toString(); // Mark as a bad URL
- } catch (Exception e) {
- status = e.toString(); // Mark as a bad URL
}
-
- //return status;
return null;
}
@@ -501,12 +487,8 @@
httpConnection.disconnect();
log.info("URL dumped: " + url + " (" + file + ")");
- } catch (final FileNotFoundException e) {
- log.error("" + e);
- log.error("URL not dumped: " + url);
- } catch (final IOException e) {
- log.error("" + e);
- log.error("URL not dumped: " + url);
+ } catch (final Exception e) {
+ log.error(e.getMessage(), e);
} finally {
try {
if (in !=null)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@lenya.apache.org
For additional commands, e-mail: commits-help@lenya.apache.org