You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@oodt.apache.org by bf...@apache.org on 2011/06/03 23:52:21 UTC
svn commit: r1131258 - in /oodt/branches/protocol/protocol-http/src:
main/java/org/apache/oodt/cas/protocol/http/
main/java/org/apache/oodt/cas/protocol/http/util/
test/org/apache/oodt/cas/protocol/http/util/
Author: bfoster
Date: Fri Jun 3 21:52:21 2011
New Revision: 1131258
URL: http://svn.apache.org/viewvc?rev=1131258&view=rev
Log:
- unit-test updates
---------------
OODT-194
Modified:
oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java
oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java
oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java
Modified: oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java
URL: http://svn.apache.org/viewvc/oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java?rev=1131258&r1=1131257&r2=1131258&view=diff
==============================================================================
--- oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java (original)
+++ oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/HttpProtocol.java Fri Jun 3 21:52:21 2011
@@ -37,9 +37,6 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
-import java.util.StringTokenizer;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
/**
*
@@ -52,25 +49,12 @@ import java.util.regex.Pattern;
*/
public class HttpProtocol implements Protocol {
- static String DIR = "dir";
+ private static Map<String, List<HttpFile>> linkChildren = new HashMap<String, List<HttpFile>>();
- static String FILE = "file";
-
- static String IGNORE = "ignore";
-
- static Map<String, List<HttpFile>> linkChildren = new HashMap<String, List<HttpFile>>();
-
- static boolean takeAllFiles = true;
-
- HttpFile parentFile;
-
- boolean abort;
-
- HttpFile currentFile;
-
- boolean isConnected;
-
- URL currentURL;
+ private HttpFile parentFile;
+ private HttpFile currentFile;
+ private boolean isConnected;
+ private URL currentURL;
public HttpProtocol() {
isConnected = false;
@@ -120,7 +104,6 @@ public class HttpProtocol implements Pro
OutputStream out = null;
InputStream in = null;
try {
- this.abort = false;
out = new BufferedOutputStream(new FileOutputStream(toFile));
if (fromFile instanceof HttpFile) {
in = ((HttpFile) fromFile).getLink().openStream();
@@ -131,7 +114,7 @@ public class HttpProtocol implements Pro
byte[] buffer = new byte[1024];
int numRead;
long numWritten = 0;
- while ((numRead = in.read(buffer)) != -1 && !this.abort) {
+ while ((numRead = in.read(buffer)) != -1) {
out.write(buffer, 0, numRead);
numWritten += numRead;
}
@@ -211,111 +194,111 @@ public class HttpProtocol implements Pro
return children;
}
- public static String findLinkInATag(String aTag) {
- // find 'href' attribute
- String find = aTag.substring(aTag.indexOf("href") + 4);
- // USE STRICT FINDING FIRST
- // (['\"])\s*?[(http)(./)(..)/#].+?\\1
- // finds link between ' or ", which starts with one of
- // the following: http, ./, .., /, #
- // these starting possibilities can then be followed any
- // number of characters until the corresponding
- // ' or " is reached.
- String patternRegExp = "(['\"])\\s*?[\\(http\\)\\(\\./\\)\\(\\.\\.\\)/#].+?\\1";
- Pattern linkPattern = Pattern.compile(patternRegExp);
- Matcher linkMatch = linkPattern.matcher(find);
- if (linkMatch.find())
- find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
- else {
- // RELAX FINDING SOME
- patternRegExp = "(['\"])\\s*?[^./].+?\\1";
- linkPattern = Pattern.compile(patternRegExp);
- linkMatch = linkPattern.matcher(find);
- if (linkMatch.find())
- find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
- else {
- // EXTREMELY RELAX FINDING
- patternRegExp = "[^\"='/>\\s]+?[^\\s>\"']*?";
- linkPattern = Pattern.compile(patternRegExp);
- linkMatch = linkPattern.matcher(find);
- if (linkMatch.find())
- find = find.substring(linkMatch.start(), linkMatch.end());
- else {
- return null;
- }
- }
- }
- return find;
- }
-
- public static String createLinkFromHref(HttpFile parent, String href) {
- if (!href.startsWith("http")) {
- String link = parent.getLink().toExternalForm();
- if (href.startsWith("..")) {
- int index = link.substring(0, link.lastIndexOf("/")).lastIndexOf("/");
- href = (index < 7) ? link + href.substring(2) : link.substring(0, link
- .substring(0, link.lastIndexOf("/")).lastIndexOf("/"))
- + href.substring(2);
- } else if (href.startsWith("./")) {
- int index = link.lastIndexOf("/");
- href = (index < 7) ? link + href.substring(1) : link
- .substring(0, index)
- + href.substring(1);
- } else if (href.startsWith("/")) {
- URL url = parent.getLink();
- href = url.getProtocol() + "://" + url.getHost() + href;
- } else {
- // find the last / in current link
- int index = link.lastIndexOf("/");
- // (index < 7) checks if in the current link, "/" only exists
- // in the protocol section of link (i.e. http://jpl.nasa.gov)
- href = (index < 7) ? link + "/" + href : link.substring(0, index) + "/"
- + href;
- }
- }
-
- // remove "/" at end of link
- if (href.endsWith("/"))
- href = href.substring(0, href.length() - 1);
- href = href.trim();
-
- return href;
- }
-
- public ProtocolFile getProtocolFileFor(String path, boolean isDir)
- throws ProtocolException {
- try {
- StringTokenizer st = new StringTokenizer(path, "/ ");
- HttpFile curPath = this.parentFile;
- // System.out.println(parentPath);
- if (st.hasMoreTokens()) {
- do {
- String token = st.nextToken();
- List<HttpFile> children = this.parseLink(curPath);
- for (HttpFile pFile : children) {
- if (pFile.getName().equals(token)) {
- // System.out.println("token " + token + " " +
- // pFile);
- curPath = pFile;
- continue;
- }
- }
- } while (st.hasMoreTokens());
- if (curPath.equals(this.parentFile))
- return new HttpFile(path, isDir, new URL("http://"
- + this.getSite().getHost() + path), curPath);
- }
- return curPath;
- } catch (Exception e) {
- throw new ProtocolException("Failed to get ProtocolPath for " + path);
- }
- }
+// public static String findLinkInATag(String aTag) {
+// // find 'href' attribute
+// String find = aTag.substring(aTag.indexOf("href") + 4);
+// // USE STRICT FINDING FIRST
+// // (['\"])\s*?[(http)(./)(..)/#].+?\\1
+// // finds link between ' or ", which starts with one of
+// // the following: http, ./, .., /, #
+// // these starting possibilities can then be followed any
+// // number of characters until the corresponding
+// // ' or " is reached.
+// String patternRegExp = "(['\"])\\s*?[\\(http\\)\\(\\./\\)\\(\\.\\.\\)/#].+?\\1";
+// Pattern linkPattern = Pattern.compile(patternRegExp);
+// Matcher linkMatch = linkPattern.matcher(find);
+// if (linkMatch.find())
+// find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
+// else {
+// // RELAX FINDING SOME
+// patternRegExp = "(['\"])\\s*?[^./].+?\\1";
+// linkPattern = Pattern.compile(patternRegExp);
+// linkMatch = linkPattern.matcher(find);
+// if (linkMatch.find())
+// find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
+// else {
+// // EXTREMELY RELAX FINDING
+// patternRegExp = "[^\"='/>\\s]+?[^\\s>\"']*?";
+// linkPattern = Pattern.compile(patternRegExp);
+// linkMatch = linkPattern.matcher(find);
+// if (linkMatch.find())
+// find = find.substring(linkMatch.start(), linkMatch.end());
+// else {
+// return null;
+// }
+// }
+// }
+// return find;
+// }
+//
+// public static String createLinkFromHref(HttpFile parent, String href) {
+// if (!href.startsWith("http")) {
+// String link = parent.getLink().toExternalForm();
+// if (href.startsWith("..")) {
+// int index = link.substring(0, link.lastIndexOf("/")).lastIndexOf("/");
+// href = (index < 7) ? link + href.substring(2) : link.substring(0, link
+// .substring(0, link.lastIndexOf("/")).lastIndexOf("/"))
+// + href.substring(2);
+// } else if (href.startsWith("./")) {
+// int index = link.lastIndexOf("/");
+// href = (index < 7) ? link + href.substring(1) : link
+// .substring(0, index)
+// + href.substring(1);
+// } else if (href.startsWith("/")) {
+// URL url = parent.getLink();
+// href = url.getProtocol() + "://" + url.getHost() + href;
+// } else {
+// // find the last / in current link
+// int index = link.lastIndexOf("/");
+// // (index < 7) checks if in the current link, "/" only exists
+// // in the protocol section of link (i.e. http://jpl.nasa.gov)
+// href = (index < 7) ? link + "/" + href : link.substring(0, index) + "/"
+// + href;
+// }
+// }
+//
+// // remove "/" at end of link
+// if (href.endsWith("/"))
+// href = href.substring(0, href.length() - 1);
+// href = href.trim();
+//
+// return href;
+// }
+//
+// public ProtocolFile getProtocolFileFor(String path, boolean isDir)
+// throws ProtocolException {
+// try {
+// StringTokenizer st = new StringTokenizer(path, "/ ");
+// HttpFile curPath = this.parentFile;
+// // System.out.println(parentPath);
+// if (st.hasMoreTokens()) {
+// do {
+// String token = st.nextToken();
+// List<HttpFile> children = this.parseLink(curPath);
+// for (HttpFile pFile : children) {
+// if (pFile.getName().equals(token)) {
+// // System.out.println("token " + token + " " +
+// // pFile);
+// curPath = pFile;
+// continue;
+// }
+// }
+// } while (st.hasMoreTokens());
+// if (curPath.equals(this.parentFile))
+// return new HttpFile(path, isDir, new URL("http://"
+// + this.getSite().getHost() + path), curPath);
+// }
+// return curPath;
+// } catch (Exception e) {
+// throw new ProtocolException("Failed to get ProtocolPath for " + path);
+// }
+// }
public void delete(ProtocolFile file) {}
- private URL getSite() {
- return currentURL;
- }
+// private URL getSite() {
+// return currentURL;
+// }
public static void main(String[] args) throws Exception {
String urlString = null, downloadToDir = null;
Modified: oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java
URL: http://svn.apache.org/viewvc/oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java?rev=1131258&r1=1131257&r2=1131258&view=diff
==============================================================================
--- oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java (original)
+++ oodt/branches/protocol/protocol-http/src/main/java/org/apache/oodt/cas/protocol/http/util/HttpUtils.java Fri Jun 3 21:52:21 2011
@@ -42,7 +42,11 @@ import org.apache.oodt.cas.protocol.http
*/
public class HttpUtils {
- private static MimeTypeUtils mimeTypes = new MimeTypeUtils();
+ static final MimeTypeUtils MIME_TYPES = new MimeTypeUtils();
+
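+ // XHTML_LINK_PATTERN looks for <a href="(group-2)">(group-3)</a>; LAZY_LINK_PATTERN looks for the self-closed <a href="(group-2)"/> . . . in both, group-1 captures the opening quote (" or ') and the closing quote must match it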
+ static final Pattern XHTML_LINK_PATTERN = Pattern.compile("<\\s*a\\s+href\\s*=\\s*(['\"])(.+?)\\1\\s*>(.+?)<\\s*/\\s*a\\s*>");
+ static final Pattern LAZY_LINK_PATTERN = Pattern.compile("<\\s*a\\s+href\\s*=\\s*(['\"])(.+?)\\1\\s*/\\s*>");
private HttpUtils() {}
@@ -96,22 +100,26 @@ public class HttpUtils {
}
public static List<HttpFile> findLinks(HttpFile file) throws IOException, URISyntaxException {
- // Pattern looking for <a href="(group-1)"/>(group-2)</a>
- Pattern linkPattern = Pattern.compile("<\\s*a\\s+href\\s*=\\s*\"(.+?)\"\\s*>(.+?)<\\s*/\\s*a\\s*>");
- Matcher matcher = linkPattern.matcher(HttpUtils.readUrl(connect(file.getLink())));
+ Matcher matcher = XHTML_LINK_PATTERN.matcher(HttpUtils.readUrl(connect(file.getLink())));
List<HttpFile> httpFiles = new ArrayList<HttpFile>();
while (matcher.find()) {
- String link = matcher.group(1);
- String virtualPath = matcher.group(2);
+ String link = matcher.group(2).trim();
+ String virtualPath = matcher.group(3).trim();
URL url = resolveUri(file.getLink().toURI(), link).toURL();
httpFiles.add(new HttpFile(link, isDirectory(url, virtualPath), url, file));
}
+ matcher = LAZY_LINK_PATTERN.matcher(HttpUtils.readUrl(connect(file.getLink())));
+ while (matcher.find()) {
+ String link = matcher.group(2).trim();
+ URL url = resolveUri(file.getLink().toURI(), link).toURL();
+ httpFiles.add(new HttpFile(link, isDirectory(url, link), url, file));
+ }
return httpFiles;
}
-
+
public static boolean isDirectory(URL url, String virtualPath) throws IOException {
try {
- String mime = mimeTypes.autoResolveContentType(url.toString(),
+ String mime = MIME_TYPES.autoResolveContentType(url.toString(),
MimeTypeUtils.readMagicHeader(url.openStream()));
return (mime.equals("text/html") && !virtualPath.endsWith(".html"));
} catch (Exception e) {
Modified: oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java
URL: http://svn.apache.org/viewvc/oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java?rev=1131258&r1=1131257&r2=1131258&view=diff
==============================================================================
--- oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java (original)
+++ oodt/branches/protocol/protocol-http/src/test/org/apache/oodt/cas/protocol/http/util/TestHttpUtils.java Fri Jun 3 21:52:21 2011
@@ -24,6 +24,7 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
+import java.util.regex.Matcher;
//OODT imports
import org.apache.oodt.cas.protocol.http.HttpFile;
@@ -75,6 +76,79 @@ public class TestHttpUtils extends TestC
assertTrue(HttpUtils.checkForRedirection(url, redirectedURL));
}
+ public void testXhtmlLinkPattern() {
+ // SUCCESS cases
+ Matcher matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\"http://localhost\">localhost</a>");
+ assertTrue(matcher.find());
+ assertEquals("\"", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+ assertEquals("localhost", matcher.group(3).trim());
+
+ matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href='http://localhost'>localhost</a>");
+ assertTrue(matcher.find());
+ assertEquals("'", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+ assertEquals("localhost", matcher.group(3).trim());
+
+ matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("< a href = \" http://localhost \" > localhost < / a >");
+ assertTrue(matcher.find());
+ assertEquals("\"", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+ assertEquals("localhost", matcher.group(3).trim());
+
+ matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("< a href = ' http://localhost ' > localhost < / a >");
+ assertTrue(matcher.find());
+ assertEquals("'", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+ assertEquals("localhost", matcher.group(3).trim());
+
+ //Should not find case: open with " end with '
+ matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\"http://localhost\'>localhost</a>");
+ assertFalse(matcher.find());
+
+ //Should not find case: open with ' end with "
+ matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\'http://localhost\">localhost</a>");
+ assertFalse(matcher.find());
+
+ //Should not find case: lazy link pattern
+ matcher = HttpUtils.XHTML_LINK_PATTERN.matcher("<a href=\"http://localhost\"/>");
+ assertFalse(matcher.find());
+ }
+
+ public void testLazyLinkPattern() {
+ Matcher matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href=\"http://localhost\"/>");
+ assertTrue(matcher.find());
+ assertEquals("\"", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+
+ matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href='http://localhost'/>");
+ assertTrue(matcher.find());
+ assertEquals("'", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+
+ matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("< a href = \" http://localhost \" / >");
+ assertTrue(matcher.find());
+ assertEquals("\"", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+
+ matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("< a href = ' http://localhost ' / >");
+ assertTrue(matcher.find());
+ assertEquals("'", matcher.group(1).trim());
+ assertEquals("http://localhost", matcher.group(2).trim());
+
+ //Should not find case: open with " end with '
+ matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href=\"http://localhost\'/>");
+ assertFalse(matcher.find());
+
+ //Should not find case: open with ' end with "
+ matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href=\'http://localhost\"/>");
+ assertFalse(matcher.find());
+
+ //Should not find case: xhtml link pattern
+ matcher = HttpUtils.LAZY_LINK_PATTERN.matcher("<a href='http://localhost'>localhost</a>");
+ assertFalse(matcher.find());
+ }
+
public void testFindLinks() throws MalformedURLException, IOException, URISyntaxException {
URL url = new URL(APACHE_SVN_SITE + PARENT_URL_OF_THIS_TEST);
HttpFile parent = new HttpFile(PARENT_URL_OF_THIS_TEST, true, url, null);
@@ -90,4 +164,10 @@ public class TestHttpUtils extends TestC
}
assertTrue(foundThisTest);
}
+
+ public void testIsDirectory() throws MalformedURLException, IOException {
+ assertTrue(HttpUtils.isDirectory(new URL(APACHE_SVN_SITE + PARENT_URL_OF_THIS_TEST), ""));
+ assertFalse(HttpUtils.isDirectory(new URL(APACHE_SVN_SITE + URL_OF_THIS_TEST), ""));
+ assertTrue(HttpUtils.isDirectory(new URL(APACHE_SVN_SITE), ""));
+ }
}