You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by sh...@apache.org on 2013/09/03 02:29:35 UTC

svn commit: r1519533 - in /manifoldcf/trunk: CHANGES.txt connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Author: shinichiro
Date: Tue Sep  3 00:29:35 2013
New Revision: 1519533

URL: http://svn.apache.org/r1519533
Log:
Fix for CONNECTORS-767

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1519533&r1=1519532&r2=1519533&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Sep  3 00:29:35 2013
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 1.4-dev =====================
 
+CONNECTORS-767: Add filename support to Web connector.
+(Shinichiro Abe)
+
 CONNECTORS-769: Include general_parentid in Livelink metadata.
 (David Morana, Karl Wright)
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1519533&r1=1519532&r2=1519533&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue Sep  3 00:29:35 2013
@@ -1343,6 +1343,17 @@ public class WebcrawlerConnector extends
 
           RepositoryDocument rd = new RepositoryDocument();
 
+          // Set the file name
+          String fileName = "";
+          try {
+            fileName = documentIdentifiertoFileName(documentIdentifier);
+          } catch (URISyntaxException e1) {
+            fileName = "";
+          }
+          if (fileName.length() > 0){
+            rd.setFileName(fileName);
+          }
+          
           // Set the content type
           rd.setMimeType(cache.getContentType(documentIdentifier));
           
@@ -5672,6 +5683,44 @@ public class WebcrawlerConnector extends
     return rval;
   }
 
+  /** Convert a document identifier to filename.
+   * @param documentIdentifier the document identifier, parsed as a URI.
+   * @return the last "/"-separated segment of the raw path ("index.html" if the path ends with "/"), followed by "?" and the raw query if one is present; the empty string if the URI has no path or an empty path.
+   * @throws URISyntaxException if documentIdentifier cannot be parsed as a URI.
+   */
+  protected String documentIdentifiertoFileName(String documentIdentifier) 
+    throws URISyntaxException
+  {
+    StringBuffer path = new StringBuffer();
+    URI uri = null;
+
+    uri = new URI(documentIdentifier);
+
+    if (uri.getRawPath() != null) { // a null path leaves the file name empty
+      if (uri.getRawPath().equals("")) { // empty path: no file name
+        path.append("");
+      } else if (uri.getRawPath().equals("/")) { // root path: use the default name
+        path.append("index.html");
+      } else if (uri.getRawPath().length() != 0) { // always true here: the empty path was handled above
+        if (uri.getRawPath().endsWith("/")) { // path ends with a slash: use the default name
+          path.append("index.html");
+        } else {
+          String[] names = uri.getRawPath().split("/"); // raw (still percent-encoded) path segments
+          path.append(names[names.length - 1]); // keep only the last segment
+        } 
+      }
+    }
+
+    if (path.length() > 0) { // the query is appended only when a file name was produced
+      if (uri.getRawQuery() != null) {
+        path.append("?");
+        path.append(uri.getRawQuery());
+      }
+    }
+
+    return path.toString();
+  }
+
   /** Find a redirection URI, if it exists */
   protected String findRedirectionURI(String currentURI)
     throws ManifoldCFException