You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@oodt.apache.org by bf...@apache.org on 2011/06/03 23:52:21 UTC

svn commit: r1131258 - in /oodt/branches/protocol/protocol-http/src: main/java/org/apache/oodt/cas/protocol/http/ main/java/org/apache/oodt/cas/protocol/http/util/ test/org/apache/oodt/cas/protocol/http/util/

Author: bfoster
Date: Fri Jun  3 21:52:21 2011
New Revision: 1131258

URL: http://svn.apache.org/viewvc?rev=1131258&view=rev
Log:

- unit-test updates 

---------------
OODT-194

Modified:
    oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java
    oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java
    oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java

Modified: oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java
URL: http://svn.apache.org/viewvc/oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java?rev=1131258&r1=1131257&r2=1131258&view=diff
==============================================================================
--- oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java (original)
+++ oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java Fri Jun  3 21:52:21 2011
@@ -37,9 +37,6 @@ import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import java.util.StringTokenizer;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 /**
  * 
@@ -52,25 +49,12 @@ import java.util.regex.Pattern;
  */
 public class HttpProtocol implements Protocol {
 
-  static String DIR = "dir";
+  private static Map<String, List<HttpFile>> linkChildren = new HashMap<String, List<HttpFile>>();
 
-  static String FILE = "file";
-
-  static String IGNORE = "ignore";
-
-  static Map<String, List<HttpFile>> linkChildren = new HashMap<String, List<HttpFile>>();
-
-  static boolean takeAllFiles = true;
-
-  HttpFile parentFile;
-
-  boolean abort;
-
-  HttpFile currentFile;
-
-  boolean isConnected;
-
-  URL currentURL;
+  private HttpFile parentFile;
+  private HttpFile currentFile;
+  private boolean isConnected;
+  private URL currentURL;
   
   public HttpProtocol() {
     isConnected = false;
@@ -120,7 +104,6 @@ public class HttpProtocol implements Pro
     OutputStream out = null;
     InputStream in = null;
     try {
-      this.abort = false;
       out = new BufferedOutputStream(new FileOutputStream(toFile));
       if (fromFile instanceof HttpFile) {
     	  in = ((HttpFile) fromFile).getLink().openStream();
@@ -131,7 +114,7 @@ public class HttpProtocol implements Pro
       byte[] buffer = new byte[1024];
       int numRead;
       long numWritten = 0;
-      while ((numRead = in.read(buffer)) != -1 && !this.abort) {
+      while ((numRead = in.read(buffer)) != -1) {
         out.write(buffer, 0, numRead);
         numWritten += numRead;
       }
@@ -211,111 +194,111 @@ public class HttpProtocol implements Pro
     return children;
   }
 
-  public static String findLinkInATag(String aTag) {
-    // find 'href' attribute
-    String find = aTag.substring(aTag.indexOf("href") + 4);
-    // USE STRICT FINDING FIRST
-    // (['\"])\s*?[(http)(./)(..)/#].+?\\1
-    // finds link between ' or ", which starts with one of
-    // the following: http, ./, .., /, #
-    // these starting possibilities can then be followed any
-    // number of characters until the corresponding
-    // ' or " is reached.
-    String patternRegExp = "(['\"])\\s*?[\\(http\\)\\(\\./\\)\\(\\.\\.\\)/#].+?\\1";
-    Pattern linkPattern = Pattern.compile(patternRegExp);
-    Matcher linkMatch = linkPattern.matcher(find);
-    if (linkMatch.find())
-      find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
-    else {
-      // RELAX FINDING SOME
-      patternRegExp = "(['\"])\\s*?[^./].+?\\1";
-      linkPattern = Pattern.compile(patternRegExp);
-      linkMatch = linkPattern.matcher(find);
-      if (linkMatch.find())
-        find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
-      else {
-        // EXTREMELY RELAX FINDING
-        patternRegExp = "[^\"='/>\\s]+?[^\\s>\"']*?";
-        linkPattern = Pattern.compile(patternRegExp);
-        linkMatch = linkPattern.matcher(find);
-        if (linkMatch.find())
-          find = find.substring(linkMatch.start(), linkMatch.end());
-        else {
-          return null;
-        }
-      }
-    }
-    return find;
-  }
-
-  public static String createLinkFromHref(HttpFile parent, String href) {
-    if (!href.startsWith("http")) {
-      String link = parent.getLink().toExternalForm();
-      if (href.startsWith("..")) {
-        int index = link.substring(0, link.lastIndexOf("/")).lastIndexOf("/");
-        href = (index < 7) ? link + href.substring(2) : link.substring(0, link
-            .substring(0, link.lastIndexOf("/")).lastIndexOf("/"))
-            + href.substring(2);
-      } else if (href.startsWith("./")) {
-        int index = link.lastIndexOf("/");
-        href = (index < 7) ? link + href.substring(1) : link
-            .substring(0, index)
-            + href.substring(1);
-      } else if (href.startsWith("/")) {
-        URL url = parent.getLink();
-        href = url.getProtocol() + "://" + url.getHost() + href;
-      } else {
-        // find the last / in current link
-        int index = link.lastIndexOf("/");
-        // (index < 7) checks if in the current link, "/" only exists
-        // in the protocol section of link (i.e. http://jpl.nasa.gov)
-        href = (index < 7) ? link + "/" + href : link.substring(0, index) + "/"
-            + href;
-      }
-    }
-
-    // remove "/" at end of link
-    if (href.endsWith("/"))
-      href = href.substring(0, href.length() - 1);
-    href = href.trim();
-
-    return href;
-  }
-
-  public ProtocolFile getProtocolFileFor(String path, boolean isDir)
-      throws ProtocolException {
-    try {
-      StringTokenizer st = new StringTokenizer(path, "/ ");
-      HttpFile curPath = this.parentFile;
-      // System.out.println(parentPath);
-      if (st.hasMoreTokens()) {
-        do {
-          String token = st.nextToken();
-          List<HttpFile> children = this.parseLink(curPath);
-          for (HttpFile pFile : children) {
-            if (pFile.getName().equals(token)) {
-              // System.out.println("token " + token + " " +
-              // pFile);
-              curPath = pFile;
-              continue;
-            }
-          }
-        } while (st.hasMoreTokens());
-        if (curPath.equals(this.parentFile))
-          return new HttpFile(path, isDir, new URL("http://"
-                  + this.getSite().getHost() + path), curPath);
-      }
-      return curPath;
-    } catch (Exception e) {
-      throw new ProtocolException("Failed to get ProtocolPath for " + path);
-    }
-  }
+//  public static String findLinkInATag(String aTag) {
+//    // find 'href' attribute
+//    String find = aTag.substring(aTag.indexOf("href") + 4);
+//    // USE STRICT FINDING FIRST
+//    // (['\"])\s*?[(http)(./)(..)/#].+?\\1
+//    // finds link between ' or ", which starts with one of
+//    // the following: http, ./, .., /, #
+//    // these starting possibilities can then be followed any
+//    // number of characters until the corresponding
+//    // ' or " is reached.
+//    String patternRegExp = "(['\"])\\s*?[\\(http\\)\\(\\./\\)\\(\\.\\.\\)/#].+?\\1";
+//    Pattern linkPattern = Pattern.compile(patternRegExp);
+//    Matcher linkMatch = linkPattern.matcher(find);
+//    if (linkMatch.find())
+//      find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
+//    else {
+//      // RELAX FINDING SOME
+//      patternRegExp = "(['\"])\\s*?[^./].+?\\1";
+//      linkPattern = Pattern.compile(patternRegExp);
+//      linkMatch = linkPattern.matcher(find);
+//      if (linkMatch.find())
+//        find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
+//      else {
+//        // EXTREMELY RELAX FINDING
+//        patternRegExp = "[^\"='/>\\s]+?[^\\s>\"']*?";
+//        linkPattern = Pattern.compile(patternRegExp);
+//        linkMatch = linkPattern.matcher(find);
+//        if (linkMatch.find())
+//          find = find.substring(linkMatch.start(), linkMatch.end());
+//        else {
+//          return null;
+//        }
+//      }
+//    }
+//    return find;
+//  }
+//
+//  public static String createLinkFromHref(HttpFile parent, String href) {
+//    if (!href.startsWith("http")) {
+//      String link = parent.getLink().toExternalForm();
+//      if (href.startsWith("..")) {
+//        int index = link.substring(0, link.lastIndexOf("/")).lastIndexOf("/");
+//        href = (index < 7) ? link + href.substring(2) : link.substring(0, link
+//            .substring(0, link.lastIndexOf("/")).lastIndexOf("/"))
+//            + href.substring(2);
+//      } else if (href.startsWith("./")) {
+//        int index = link.lastIndexOf("/");
+//        href = (index < 7) ? link + href.substring(1) : link
+//            .substring(0, index)
+//            + href.substring(1);
+//      } else if (href.startsWith("/")) {
+//        URL url = parent.getLink();
+//        href = url.getProtocol() + "://" + url.getHost() + href;
+//      } else {
+//        // find the last / in current link
+//        int index = link.lastIndexOf("/");
+//        // (index < 7) checks if in the current link, "/" only exists
+//        // in the protocol section of link (i.e. http://jpl.nasa.gov)
+//        href = (index < 7) ? link + "/" + href : link.substring(0, index) + "/"
+//            + href;
+//      }
+//    }
+//
+//    // remove "/" at end of link
+//    if (href.endsWith("/"))
+//      href = href.substring(0, href.length() - 1);
+//    href = href.trim();
+//
+//    return href;
+//  }
+//
+//  public ProtocolFile getProtocolFileFor(String path, boolean isDir)
+//      throws ProtocolException {
+//    try {
+//      StringTokenizer st = new StringTokenizer(path, "/ ");
+//      HttpFile curPath = this.parentFile;
+//      // System.out.println(parentPath);
+//      if (st.hasMoreTokens()) {
+//        do {
+//          String token = st.nextToken();
+//          List<HttpFile> children = this.parseLink(curPath);
+//          for (HttpFile pFile : children) {
+//            if (pFile.getName().equals(token)) {
+//              // System.out.println("token " + token + " " +
+//              // pFile);
+//              curPath = pFile;
+//              continue;
+//            }
+//          }
+//        } while (st.hasMoreTokens());
+//        if (curPath.equals(this.parentFile))
+//          return new HttpFile(path, isDir, new URL("http://"
+//                  + this.getSite().getHost() + path), curPath);
+//      }
+//      return curPath;
+//    } catch (Exception e) {
+//      throw new ProtocolException("Failed to get ProtocolPath for " + path);
+//    }
+//  }
 
   public void delete(ProtocolFile file) {}
 
-  private URL getSite() {
-	return currentURL;  
-  }
+//  private URL getSite() {
+//	return currentURL;  
+//  }
   
   public static void main(String[] args) throws Exception {
     String urlString = null, downloadToDir = null;

Modified: oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java
URL: http://svn.apache.org/viewvc/oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java?rev=1131258&r1=1131257&r2=1131258&view=diff
==============================================================================
--- oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java (original)
+++ oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java Fri Jun  3 21:52:21 2011
@@ -42,7 +42,11 @@ import org.apache.oodt.cas.protocol.http
  */
 public class HttpUtils {
 
-  private static MimeTypeUtils mimeTypes = new MimeTypeUtils();
+  static final MimeTypeUtils MIME_TYPES = new MimeTypeUtils();
+  
+	// Pattern looking for <a href="(group-2)">(group-3)</a> . . . group-1 is the opening quote (either " or ') and the closing quote must match it
+	static final Pattern XHTML_LINK_PATTERN = Pattern.compile("<\\s*a\\s+href\\s*=\\s*(['\"])(.+?)\\1\\s*>(.+?)<\\s*/\\s*a\\s*>"); 
+	static final Pattern LAZY_LINK_PATTERN = Pattern.compile("<\\s*a\\s+href\\s*=\\s*(['\"])(.+?)\\1\\s*/\\s*>"); 
 
 	private HttpUtils() {}
 	
@@ -96,22 +100,26 @@ public class HttpUtils {
   }
 
 	public static List<HttpFile> findLinks(HttpFile file) throws IOException, URISyntaxException {
-		// Pattern looking for <a href="(group-1)"/>(group-2)</a>
-		Pattern linkPattern = Pattern.compile("<\\s*a\\s+href\\s*=\\s*\"(.+?)\"\\s*>(.+?)<\\s*/\\s*a\\s*>"); 
-		Matcher matcher = linkPattern.matcher(HttpUtils.readUrl(connect(file.getLink())));
+		Matcher matcher = XHTML_LINK_PATTERN.matcher(HttpUtils.readUrl(connect(file.getLink())));
 		List<HttpFile> httpFiles = new ArrayList<HttpFile>();
 		while (matcher.find()) {
-			String link = matcher.group(1);
-			String virtualPath = matcher.group(2);
+			String link = matcher.group(2).trim();
+			String virtualPath = matcher.group(3).trim();
 			URL url = resolveUri(file.getLink().toURI(), link).toURL();
 			httpFiles.add(new HttpFile(link, isDirectory(url, virtualPath), url, file));
 		}
+		matcher = LAZY_LINK_PATTERN.matcher(HttpUtils.readUrl(connect(file.getLink())));
+		while (matcher.find()) {
+			String link = matcher.group(2).trim();
+			URL url = resolveUri(file.getLink().toURI(), link).toURL();
+			httpFiles.add(new HttpFile(link, isDirectory(url, link), url, file));
+		}
 		return httpFiles;
 	}
-	
+		
 	public static boolean isDirectory(URL url, String virtualPath) throws IOException {
 		try {
-			String mime = mimeTypes.autoResolveContentType(url.toString(),
+			String mime = MIME_TYPES.autoResolveContentType(url.toString(),
 					MimeTypeUtils.readMagicHeader(url.openStream()));
 			return (mime.equals("text/html") && !virtualPath.endsWith(".html"));
 		} catch (Exception e) {

Modified: oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java
URL: http://svn.apache.org/viewvc/oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java?rev=1131258&r1=1131257&r2=1131258&view=diff
==============================================================================
--- oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java (original)
+++ oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java Fri Jun  3 21:52:21 2011
@@ -24,6 +24,7 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.util.List;
+import java.util.regex.Matcher;
 
 //OODT imports
 import org.apache.oodt.cas.protocol.http.HttpFile;
@@ -75,6 +76,79 @@ public class TestHttpUtils extends TestC
 		assertTrue(HttpUtils.checkForRedirection(url, redirectedURL));
 	}
 
+	public void testXhtmlLinkPattern() {
+		// SUCCESS cases
+		Matcher matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\"http://localhost\">localhost</a>");
+		assertTrue(matcher.find());
+		assertEquals("\"", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		assertEquals("localhost", matcher.group(3).trim());
+		
+		matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href='http://localhost'>localhost</a>");
+		assertTrue(matcher.find());
+		assertEquals("'", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		assertEquals("localhost", matcher.group(3).trim());
+		
+		matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("< a href = \" http://localhost \" >  localhost < / a >");
+		assertTrue(matcher.find());
+		assertEquals("\"", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		assertEquals("localhost", matcher.group(3).trim());
+		
+		matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("< a href = ' http://localhost ' >  localhost < / a >");
+		assertTrue(matcher.find());
+		assertEquals("'", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		assertEquals("localhost", matcher.group(3).trim());
+		
+		//Should not find case: open with " end with '
+		matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\"http://localhost\'>localhost</a>");
+		assertFalse(matcher.find());
+		
+		//Should not find case: open with ' end with "
+		matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\'http://localhost\">localhost</a>");
+		assertFalse(matcher.find());
+		
+		//Should not find case: lazy link pattern
+		matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\"http://localhost\"/>");
+		assertFalse(matcher.find());
+	}
+	
+	public void testLazyLinkPattern() {
+		Matcher matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href=\"http://localhost\"/>");
+		assertTrue(matcher.find());
+		assertEquals("\"", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		
+		matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href='http://localhost'/>");
+		assertTrue(matcher.find());
+		assertEquals("'", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		
+		matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("< a href = \" http://localhost \" / >");
+		assertTrue(matcher.find());
+		assertEquals("\"", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		
+		matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("< a href = ' http://localhost ' / >");
+		assertTrue(matcher.find());
+		assertEquals("'", matcher.group(1).trim());
+		assertEquals("http://localhost", matcher.group(2).trim());
+		
+		//Should not find case: open with " end with '
+		matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href=\"http://localhost\'/>");
+		assertFalse(matcher.find());
+		
+		//Should not find case: open with ' end with "
+		matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href=\'http://localhost\"/>");
+		assertFalse(matcher.find());
+		
+		//Should not find case: xhtml link pattern
+		matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href='http://localhost'>localhost</a>");
+		assertFalse(matcher.find());
+	}
+	
 	public void testFindLinks() throws MalformedURLException, IOException, URISyntaxException {
 		URL url = new URL(APACHE_SVN_SITE + PARENT_URL_OF_THIS_TEST);
 		HttpFile parent = new HttpFile(PARENT_URL_OF_THIS_TEST, true, url, null);
@@ -90,4 +164,10 @@ public class TestHttpUtils extends TestC
 		}
 		assertTrue(foundThisTest);
 	}
+	
+	public void testIsDirectory() throws MalformedURLException, IOException {
+		assertTrue(HttpUtils.isDirectory(new URL(APACHE_SVN_SITE + PARENT_URL_OF_THIS_TEST), ""));
+		assertFalse(HttpUtils.isDirectory(new URL(APACHE_SVN_SITE + URL_OF_THIS_TEST), ""));
+		assertTrue(HttpUtils.isDirectory(new URL(APACHE_SVN_SITE), ""));
+	}
 }