You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@maven.apache.org by ah...@apache.org on 2006/01/15 01:36:26 UTC

svn commit: r369134 - in /maven/maven-1/plugins/trunk/linkcheck: project.xml src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java xdocs/changes.xml

Author: aheritier
Date: Sat Jan 14 16:36:18 2006
New Revision: 369134

URL: http://svn.apache.org/viewcvs?rev=369134&view=rev
Log:
PR: MPLINKCHECK-20, MPLINKCHECK-23
Submitted by: Ignacio G. Mac Dowell
Reviewed by: aheritier
Improve performance by replacing the jtidy dependency with regexp-based link matching (MPLINKCHECK-23).
Fix StackOverflowError when processing apidocs/index-all.html (MPLINKCHECK-20).

Added:
    maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java
Modified:
    maven/maven-1/plugins/trunk/linkcheck/project.xml
    maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java
    maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml

Modified: maven/maven-1/plugins/trunk/linkcheck/project.xml
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/project.xml?rev=369134&r1=369133&r2=369134&view=diff
==============================================================================
--- maven/maven-1/plugins/trunk/linkcheck/project.xml (original)
+++ maven/maven-1/plugins/trunk/linkcheck/project.xml Sat Jan 14 16:36:18 2006
@@ -201,28 +201,9 @@
       </properties>
     </dependency>
     <dependency>
-      <groupId>dom4j</groupId>
-      <artifactId>dom4j</artifactId>
-      <version>1.4</version>
-      <properties>
-        <comment>This library is already loaded by maven's core. Be careful to use the same version number as in the core.</comment>
-      </properties>
-    </dependency>
-    <dependency>
-      <groupId>jtidy</groupId>
-      <artifactId>jtidy</artifactId>
-      <version>4aug2000r7-dev</version>
-    </dependency>
-    <dependency>
       <groupId>maven</groupId>
       <artifactId>maven</artifactId>
       <version>1.0.2</version>
-    </dependency>
-    <dependency>
-      <jar>js-1.5R4-RC3.jar</jar>
-      <groupId>rhino</groupId>
-      <artifactId>rhino</artifactId>
-      <version>1.5R4-RC3</version>
     </dependency>
   </dependencies>
 </project>

Modified: maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java?rev=369134&r1=369133&r2=369134&view=diff
==============================================================================
--- maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java (original)
+++ maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java Sat Jan 14 16:36:18 2006
@@ -17,29 +17,19 @@
  * ====================================================================
  */
 
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.PrintWriter;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
-import java.util.TreeSet;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.maven.plugin.linkcheck.validation.LinkValidationItem;
 import org.apache.maven.plugin.linkcheck.validation.LinkValidationResult;
 import org.apache.maven.plugin.linkcheck.validation.LinkValidatorManager;
-import org.dom4j.Document;
-import org.dom4j.Node;
-import org.dom4j.io.DOMReader;
-import org.w3c.tidy.Tidy;
 
 /**
  * @author <a href="mailto:bwalding@apache.org">Ben Walding</a>
@@ -73,30 +63,6 @@
 
     private int unsuccessful;
 
-    private Set getLinks()
-        throws FileNotFoundException
-    {
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        PrintWriter errOut = new PrintWriter( baos );
-        BufferedInputStream bin = new BufferedInputStream( new FileInputStream( fileToCheck ) );
-        try
-        {
-            Tidy tidy = getTidy();
-            tidy.setErrout( errOut );
-            org.w3c.dom.Document domDocument = tidy.parseDOM( bin, null );
-            // now read a dom4j document from
-            // JTidy's W3C DOM object
-            final DOMReader domReader = new DOMReader();
-            final Document doc = domReader.read( domDocument );
-            return findUniqueLinks( doc );
-        }
-        finally
-        {
-            close( bin );
-            close( baos );
-        }
-    }
-
     /**
      * Returns the message.
      * @return String
@@ -172,7 +138,7 @@
             final Set hrefs;
             try
             {
-                hrefs = getLinks();
+                hrefs = LinkMatcher.match( fileToCheck );
             }
             catch ( Throwable t )
             {
@@ -259,69 +225,4 @@
     {
         this.links.add( lcr );
     }
-
-    private void close( InputStream is )
-    {
-        try
-        {
-            is.close();
-        }
-        catch ( Exception e )
-        {
-            //Don't really care.
-        }
-    }
-
-    private void close( OutputStream os )
-    {
-        try
-        {
-            os.close();
-        }
-        catch ( Exception e )
-        {
-            //Don't really care.
-        }
-    }
-
-    private Set findUniqueLinks( Document doc )
-    {
-        List xpathResults = new LinkedList();
-
-        xpathResults.addAll( doc.selectNodes( "//a/@href" ) );
-        xpathResults.addAll( doc.selectNodes( "//img/@src" ) );
-
-        xpathResults.addAll( doc.selectNodes( "//link/@href" ) );
-
-        xpathResults.addAll( doc.selectNodes( "//script/@src" ) );
-
-        Set results = new TreeSet();
-        Iterator linkIter = xpathResults.iterator();
-        Node node = null;
-        String href = null;
-        while ( linkIter.hasNext() )
-        {
-            node = (Node) linkIter.next();
-            href = node.getText();
-            results.add( href );
-        }
-        xpathResults = null;
-        linkIter = null;
-        node = null;
-        href = null;
-        return results;
-    }
-
-    private Tidy getTidy()
-    {
-        Tidy tidy = new Tidy();
-        tidy.setMakeClean( true );
-        tidy.setXmlTags( true );
-        tidy.setXmlOut( true );
-        tidy.setXHTML( true );
-        tidy.setQuiet( true );
-        tidy.setShowWarnings( false );
-        return tidy;
-    }
-
-}
\ No newline at end of file
+}

Added: maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java?rev=369134&view=auto
==============================================================================
--- maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java (added)
+++ maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java Sat Jan 14 16:36:18 2006
@@ -0,0 +1,122 @@
+package org.apache.maven.plugin.linkcheck;
+
+/* ====================================================================
+ *   Copyright 2001-2006 The Apache Software Foundation.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ * ====================================================================
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Link matcher. Reads the contents of a file and extracts the targets of:
+ * <pre>
+ * &lt;a href="..."&gt;
+ * &lt;link href="..."&gt;
+ * &lt;img src="..."&gt;
+ * &lt;script src="..."&gt;
+ * </pre>
+ * Matching is regexp based; the file is not parsed as HTML.
+ *
+ * @author <a href="mailto:mac@apache.org">Ignacio G. Mac Dowell </a>
+ */
+class LinkMatcher
+{
+
+    /** Regexp for link matching. Group 1 captures the quoted attribute value. */
+    private final static Pattern p = Pattern
+        .compile( "<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?",
+                  Pattern.CASE_INSENSITIVE );
+
+    /** Scheme of script pseudo-links; such links are skipped, not checked. */
+    private final static String JAVASCRIPT_SCHEME = "javascript:";
+
+    /**
+     * Reads a file and returns a StringBuffer with its contents. Lines are
+     * appended one after another without their line terminators.
+     *
+     * TODO: Check for encoding issues
+     *
+     * @param file
+     *            the file we are reading
+     * @return a StringBuffer with file's contents.
+     * @throws IOException
+     *             if the file cannot be opened or read
+     */
+    private static StringBuffer fileToStringBuffer( File file )
+        throws IOException
+    {
+        final StringBuffer pageBuffer = new StringBuffer();
+        // Open before the try block: if opening fails there is nothing to
+        // close and the IOException reaches the caller unmasked.
+        final BufferedReader reader = new BufferedReader( new FileReader( file ) );
+        try
+        {
+            String line;
+            while ( ( line = reader.readLine() ) != null )
+            {
+                pageBuffer.append( line );
+            }
+        }
+        finally
+        {
+            reader.close();
+        }
+        return pageBuffer;
+    }
+
+    /**
+     * Performs the actual matching. Empty values and <code>javascript:</code>
+     * pseudo-links are left out of the result.
+     *
+     * @param file
+     *            the file to check
+     * @return a new sorted set with all links to check; it belongs to the
+     *         caller and is not modified by later calls
+     * @throws IOException
+     *             if the file cannot be opened or read
+     */
+    static Set match( File file )
+        throws IOException
+    {
+        // A fresh set per call: a shared static set would be emptied while
+        // an earlier caller may still be holding it.
+        final Set links = new TreeSet();
+        final Matcher m = p.matcher( fileToStringBuffer( file ) );
+        while ( m.find() )
+        {
+            final String link = m.group( 1 ).trim();
+            if ( link.length() < 1 )
+            {
+                continue;
+            }
+            // Only a leading scheme marks a script link; a substring test
+            // would also drop targets such as "javascript-guide.html".
+            if ( link.regionMatches( true, 0, JAVASCRIPT_SCHEME, 0, JAVASCRIPT_SCHEME.length() ) )
+            {
+                continue;
+            }
+            links.add( link );
+        }
+        return links;
+    }
+
+}

Modified: maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml
URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml?rev=369134&r1=369133&r2=369134&view=diff
==============================================================================
--- maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml (original)
+++ maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml Sat Jan 14 16:36:18 2006
@@ -26,6 +26,8 @@
   </properties>
   <body>
     <release version="1.4-SNAPSHOT" date="in SVN">
+      <action dev="aheritier" type="update" issue="MPLINKCHECK-23" due-to="Ignacio G. Mac Dowell">Improve performance getting rid of jtidy dependency via regexps.</action>
+      <action dev="aheritier" type="fix" issue="MPLINKCHECK-20" due-to="Ignacio G. Mac Dowell">StackOverflowError processing apidocs/index-all.html.</action>    
       <action dev="aheritier" type="add">If maven is in offline mode the report doesn't test external urls. A warning is displayed in the report.</action>
       <action dev="aheritier" type="update" issue="MPLINKCHECK-10">"Moved Permanently" sites are reported as a warning and not as an error.</action>
       <action dev="aheritier" type="update" issue="MPLINKCHECK-24">Speed and stability enhancement [better usage of httpClient].</action>