You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@maven.apache.org by ca...@apache.org on 2005/10/28 01:04:10 UTC

svn commit: r328975 - in /maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4: project.xml src/main/org/apache/maven/linkcheck/FileToCheck.java src/main/org/apache/maven/linkcheck/LinkMatcher.java src/test/org/apache/maven/linkcheck/LinkCheckTest.java

Author: carlos
Date: Thu Oct 27 16:04:06 2005
New Revision: 328975

URL: http://svn.apache.org/viewcvs?rev=328975&view=rev
Log:
Use regexps instead of xml parsing MPLINKCHECK-23

Added:
    maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java   (with props)
Modified:
    maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/project.xml
    maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/FileToCheck.java
    maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/test/org/apache/maven/linkcheck/LinkCheckTest.java

Modified: maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/project.xml
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/project.xml?rev=328975&r1=328974&r2=328975&view=diff
==============================================================================
--- maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/project.xml (original)
+++ maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/project.xml Thu Oct 27 16:04:06 2005
@@ -180,28 +180,9 @@
       </properties>
     </dependency>
     <dependency>
-      <groupId>dom4j</groupId>
-      <artifactId>dom4j</artifactId>
-      <version>1.4</version>
-      <properties>
-        <comment>This library is already loaded by maven's core. Be careful to use the same version number as in the core.</comment>
-      </properties>
-    </dependency>
-    <dependency>
-      <groupId>jtidy</groupId>
-      <artifactId>jtidy</artifactId>
-      <version>4aug2000r7-dev</version>
-    </dependency>
-    <dependency>
       <groupId>maven</groupId>
       <artifactId>maven</artifactId>
       <version>1.0.2</version>
-    </dependency>
-    <dependency>
-      <jar>js-1.5R4-RC3.jar</jar>
-      <groupId>rhino</groupId>
-      <artifactId>rhino</artifactId>
-      <version>1.5R4-RC3</version>
     </dependency>
   </dependencies>
 </project>

Modified: maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/FileToCheck.java
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/FileToCheck.java?rev=328975&r1=328974&r2=328975&view=diff
==============================================================================
--- maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/FileToCheck.java (original)
+++ maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/FileToCheck.java Thu Oct 27 16:04:06 2005
@@ -17,29 +17,19 @@
  * ====================================================================
  */
 
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.PrintWriter;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
-import java.util.TreeSet;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.maven.linkcheck.validation.LinkValidationItem;
 import org.apache.maven.linkcheck.validation.LinkValidationResult;
 import org.apache.maven.linkcheck.validation.LinkValidatorManager;
-import org.dom4j.Document;
-import org.dom4j.Node;
-import org.dom4j.io.DOMReader;
-import org.w3c.tidy.Tidy;
 
 /**
  * @author <a href="mailto:bwalding@apache.org">Ben Walding</a>
@@ -88,7 +78,7 @@
             final Set hrefs;
             try
             {
-                hrefs = getLinks();
+                hrefs = LinkMatcher.match(fileToCheck);
             }
             catch (Throwable e)
             {
@@ -143,34 +133,6 @@
         }
     }
 
-    private Set getLinks() throws FileNotFoundException
-    {
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        PrintWriter errOut = new PrintWriter(baos);
-        BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileToCheck));
-        try
-        {
-            Tidy tidy = getTidy();
-            tidy.setErrout(errOut);
-            LOG.debug("Processing:" + fileToCheck);
-            org.w3c.dom.Document domDocument = tidy.parseDOM(bin, null);
-
-            // now read a dom4j document from
-            // JTidy's W3C DOM object
-            final DOMReader domReader = new DOMReader();
-            final Document doc = domReader.read(domDocument);
-
-            LOG.debug(baos.toString());
-
-            return findUniqueLinks(doc);
-        }
-        finally
-        {
-            close(bin);
-            close(baos);
-        }
-    }
-
     private void close(InputStream is)
     {
         try
@@ -179,7 +141,7 @@
         }
         catch (Exception e)
         {
-            //Don't really care.
+            // Don't really care.
         }
     }
 
@@ -193,43 +155,6 @@
         {
             //Don't really care.
         }
-    }
-
-    private Set findUniqueLinks(Document doc)
-    {
-        List xpathResults = new LinkedList();
-
-        xpathResults.addAll(doc.selectNodes("//a/@href"));
-        xpathResults.addAll(doc.selectNodes("//img/@src"));
-
-        //<link rel="stylesheet" href="...">
-        xpathResults.addAll(doc.selectNodes("//link/@href"));
-
-        //<script src="http://ar.atwola.com/file/adsWrapper.js">
-        xpathResults.addAll(doc.selectNodes("//script/@src"));
-
-        Set results = new TreeSet();
-        Iterator linkIter = xpathResults.iterator();
-        while (linkIter.hasNext())
-        {
-            Node node = (Node) linkIter.next();
-            String href = node.getText();
-            results.add(href);
-        }
-
-        return results;
-    }
-
-    private Tidy getTidy()
-    {
-        Tidy tidy = new Tidy();
-        tidy.setMakeClean(true);
-        tidy.setXmlTags(true);
-        tidy.setXmlOut(true);
-        tidy.setXHTML(true);
-        tidy.setQuiet(true);
-        tidy.setShowWarnings(false);
-        return tidy;
     }
 
     /**

Added: maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java?rev=328975&view=auto
==============================================================================
--- maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java (added)
+++ maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java Thu Oct 27 16:04:06 2005
@@ -0,0 +1,111 @@
+package org.apache.maven.linkcheck;
+
+/* ====================================================================
+ *   Copyright 2001-2004 The Apache Software Foundation.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ * ====================================================================
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Link matcher. Reads the contents of a file and tries to match the following:
+ * <code>
+ * <a href=""....
+ * <link href=""....
+ * <img src=""....
+ * <script src=""....
+ * </code>
+ * 
+ * @author <a href="mailto:mac@apache.org">Ignacio G. Mac Dowell </a>
+ */
+class LinkMatcher {
+
+	/**
+	 * Regexp for link matching: captures, as group 1, the quoted value of an
+	 * href or src attribute found after a link, a, img or script tag opening.
+	 */
+	private final static Pattern p = Pattern
+			.compile(
+					"<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?",
+					Pattern.CASE_INSENSITIVE);
+
+	/** Scheme prefix of script pseudo-links, which are never collected. */
+	private final static String JAVASCRIPT_SCHEME = "javascript:";
+
+	/**
+	 * Reads a file and returns a StringBuffer with its contents. Lines are
+	 * appended without their terminators, so text that is split across lines
+	 * in the file is joined back together.
+	 * 
+	 * TODO: Check for encoding issues (FileReader uses the default charset)
+	 * 
+	 * @param file
+	 *            the file we are reading
+	 * @return a StringBuffer with file's contents.
+	 * @throws IOException
+	 *             if the file cannot be opened or read
+	 */
+	private static StringBuffer fileToStringBuffer(File file)
+			throws IOException {
+		final StringBuffer pageBuffer = new StringBuffer();
+		// Opened before the try: on failure there is nothing to close, and a
+		// null reader.close() in finally would mask the IOException with an NPE.
+		final BufferedReader reader = new BufferedReader(new FileReader(file));
+		try {
+			String line;
+			while ((line = reader.readLine()) != null) {
+				pageBuffer.append(line);
+			}
+		} finally {
+			reader.close();
+		}
+		return pageBuffer;
+	}
+
+	/**
+	 * Performs the actual matching. Empty links and javascript: pseudo-links
+	 * are skipped; mailto: links are not filtered out.
+	 * 
+	 * @param file
+	 *            the file to check
+	 * @return a new sorted set, owned by the caller, with all links to check
+	 * @throws IOException
+	 *             if the file cannot be opened or read
+	 */
+	static Set match(File file) throws IOException {
+		// A fresh set per call: a shared static set would be cleared under
+		// callers still holding the result of a previous call.
+		final Set links = new TreeSet();
+		final Matcher m = p.matcher(fileToStringBuffer(file));
+		while (m.find()) {
+			final String link = m.group(1).trim();
+			// Only a leading scheme marks a script link; a URL that merely
+			// contains "javascript" (e.g. a file name) must still be checked.
+			if (link.length() > 0 && !link.regionMatches(true, 0,
+					JAVASCRIPT_SCHEME, 0, JAVASCRIPT_SCHEME.length())) {
+				links.add(link);
+			}
+		}
+		return links;
+	}
+
+}

Propchange: maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/main/org/apache/maven/linkcheck/LinkMatcher.java
------------------------------------------------------------------------------
    svn:keywords = "Author Date Id Revision"

Modified: maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/test/org/apache/maven/linkcheck/LinkCheckTest.java
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/test/org/apache/maven/linkcheck/LinkCheckTest.java?rev=328975&r1=328974&r2=328975&view=diff
==============================================================================
--- maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/test/org/apache/maven/linkcheck/LinkCheckTest.java (original)
+++ maven/maven-1/plugins/branches/MAVEN_LINCHECK_1_4/src/test/org/apache/maven/linkcheck/LinkCheckTest.java Thu Oct 27 16:04:06 2005
@@ -59,7 +59,7 @@
       map.put(ftc.getName(), ftc);
     }
 
-    assertEquals("files.size()", 8, lc.getFiles().size());
+    assertEquals("files.size()", 9, lc.getFiles().size());
 
     check(map, "nolink.html", 0);
     check(map, "test-resources/nolink.html", 0);
@@ -67,6 +67,7 @@
     check(map, "test-resources/test1/test2.html", 0);
     check(map, "test1/test1.html", 1);
     check(map, "testA.html", 3);
+    check(map, "testSplit.html", 3);
 
     /* test excludes */
     String fileName = "testExcludes.html";