You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by gr...@apache.org on 2005/05/23 02:34:23 UTC

svn commit: r176412 - /lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java

Author: gregor
Date: Sun May 22 17:34:22 2005
New Revision: 176412

URL: http://svn.apache.org/viewcvs?rev=176412&view=rev
Log:
Simplified parsing code

Modified:
    lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java

Modified: lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java?rev=176412&r1=176411&r2=176412&view=diff
==============================================================================
--- lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java (original)
+++ lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java Sun May 22 17:34:22 2005
@@ -265,18 +265,15 @@
     /**
      * Parse a URL
      * @param urlString URL to parse
-     * @return ok if the parse succeeded, or an error message if it did not
-     * FIXME why does this return null?
+     * @return a list of URL
      */
     public List parsePage(String urlString) {
-        String status = "ok";
-
-        try {
-            URL currentURL = new java.net.URL(urlString);
-            HttpURLConnection httpCon = (HttpURLConnection) currentURL.openConnection();
+    	HttpURLConnection httpCon = null;
 
+    	try {
+            URL currentURL = new URL(urlString);
+            httpCon = (HttpURLConnection) currentURL.openConnection();
             httpCon.setRequestProperty("User-Agent", "Lenya Lucene Crawler");
-
             httpCon.connect();
 
             if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
@@ -286,25 +283,14 @@
                     return handleHTML(httpCon);
                 } else if (contentType.indexOf("application/pdf") != -1) {
                     handlePDF(httpCon);
-                } else {
-                    status = "Not an excepted content type : " + contentType;
                 }
-            } else {
-                status = "bad";
             }
 
+        } catch (final Exception e) {
+            log.error(e.getMessage(), e);
+        } finally {
             httpCon.disconnect();
-        } catch (java.net.MalformedURLException mue) {
-            status = mue.toString();
-        } catch (java.net.UnknownHostException uh) {
-            status = uh.toString(); // Mark as a bad URL
-        } catch (java.io.IOException ioe) {
-            status = ioe.toString(); // Mark as a bad URL
-        } catch (Exception e) {
-            status = e.toString(); // Mark as a bad URL
         }
-
-        //return status;
         return null;
     }
 
@@ -501,12 +487,8 @@
                 httpConnection.disconnect();
 
                 log.info("URL dumped: " + url + " (" + file + ")");
-            } catch (final FileNotFoundException e) {
-                log.error("" + e);
-                log.error("URL not dumped: " + url);
-            } catch (final IOException e) {
-                log.error("" + e);
-                log.error("URL not dumped: " + url);
+            } catch (final Exception e) {
+                log.error(e.getMessage(), e);
             } finally {
                 try {
                     if (in !=null)



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@lenya.apache.org
For additional commands, e-mail: commits-help@lenya.apache.org