You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/12/21 10:12:21 UTC

svn commit: r606140 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/utils/ src/test/java/org/apache/tika/utils/

Author: jukka
Date: Fri Dec 21 01:12:20 2007
New Revision: 606140

URL: http://svn.apache.org/viewvc?rev=606140&view=rev
Log:
TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex
    - Patch from Niall Pemberton

Added:
    incubator/tika/trunk/src/test/java/org/apache/tika/utils/
    incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/pom.xml
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Dec 21 01:12:20 2007
@@ -137,3 +137,6 @@
  
 62. TIKA-104 - Add utility methods to throw IOException with the caused
                intialized (jukka & Niall Pemberton)
+
+63. TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex
+               (Niall Pemberton)

Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Dec 21 01:12:20 2007
@@ -195,11 +195,6 @@
       <version>1.1.1</version>
     </dependency>
     <dependency>
-      <groupId>oro</groupId>
-      <artifactId>oro</artifactId>
-      <version>2.0.8</version>
-    </dependency>
-    <dependency>
       <groupId>nekohtml</groupId>
       <artifactId>nekohtml</artifactId>
       <version>0.9.5</version>

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Fri Dec 21 01:12:20 2007
@@ -20,7 +20,6 @@
 import java.io.InputStream;
 import java.io.StringWriter;
 
-import org.apache.oro.text.regex.MalformedPatternException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.TeeContentHandler;
@@ -38,11 +37,6 @@
  */
 public class ParserPostProcessor extends ParserDecorator {
 
-    private static final String LINK_PATTERN =
-        "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
-        + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
-        + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
-
     /**
      * Creates a post-processing decorator for the given parser.
      *
@@ -70,12 +64,8 @@
         int length = Math.min(content.length(), 500);
         metadata.set("summary", content.substring(0, length));
 
-        try {
-            for (String link : RegexUtils.extract(content, LINK_PATTERN)) {
-                metadata.add("outlinks", link);
-            }
-        } catch (MalformedPatternException e) {
-            throw new TikaException("Malformed URL pattern", e);
+        for (String link : RegexUtils.extractLinks(content)) {
+            metadata.add("outlinks", link);
         }
     }
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java Fri Dec 21 01:12:20 2007
@@ -17,17 +17,10 @@
 package org.apache.tika.utils;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract
@@ -37,32 +30,37 @@
  */
 public class RegexUtils {
 
-    static Logger logger = Logger.getRootLogger();
-
-    public static List<String> extract(String content, String regex)
-            throws MalformedPatternException {
+    /**
+     * Regex pattern used to find URLs within plain text.
+     * 
+     * @see <a
+     *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+     *      </a>
+     */
+    private static final String LINKS_REGEX =
+        "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
+        + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
+        + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+    
+    private static final Pattern LINKS_PATTERN = Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE);
+
+    /**
+     * Extract urls from plain text.
+     *
+     * @param content The plain text content to examine
+     * @return List of urls found within the plain text
+     */
+    public static List<String> extractLinks(String content) {
+        if (content == null || content.length() == 0) {
+            return Collections.emptyList();
+        }
 
         List<String> extractions = new ArrayList<String>();
-        final PatternCompiler cp = new Perl5Compiler();
-        final Pattern pattern = cp.compile(regex,
-                Perl5Compiler.CASE_INSENSITIVE_MASK
-                        | Perl5Compiler.READ_ONLY_MASK
-                        | Perl5Compiler.MULTILINE_MASK);
-        final PatternMatcher matcher = new Perl5Matcher();
-
-        final PatternMatcherInput input = new PatternMatcherInput(content);
-
-        MatchResult result;
-        String extractedContent;
-
-        while (matcher.contains(input, pattern)) {
-            result = matcher.getMatch();
-            extractedContent = result.group(0);
-            extractions.add(extractedContent);
+        final Matcher matcher = LINKS_PATTERN.matcher(content);
+        while (matcher.find()) {
+            extractions.add(matcher.group());
         }
-
         return extractions;
 
     }
-
 }

Added: incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java?rev=606140&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java Fri Dec 21 01:12:20 2007
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.util.List;
+import junit.framework.TestCase;
+
+/**
+ * Test case for {@link RegexUtils}.
+ *
+ * @version $Revision$ $Date$
+ */
+public class RegexUtilsTest extends TestCase {
+
+    /** 
+     * Test {@link RegexUtils#extractLinks(String)} with no links.
+     */
+
+    public void testExtractLinksNone() {
+        List<String> links = null;
+                
+        links = RegexUtils.extractLinks(null);
+        assertNotNull(links);
+        assertEquals(0, links.size());
+        
+        links = RegexUtils.extractLinks("");
+        assertNotNull(links);
+        assertEquals(0, links.size());
+        
+        links = RegexUtils.extractLinks(
+                "Test with no links " +
+                "What about www.google.com");
+        assertNotNull(links);
+        assertEquals(0, links.size());
+    }
+      
+
+    /** 
+     * Test {@link RegexUtils#extractLinks(String)} for http.
+     */
+    public void testExtractLinksHttp() {
+        List<String> links = RegexUtils.extractLinks(
+                "Test with http://www.nutch.org/index.html is it found? " +
+                "What about www.google.com at http://www.google.de " +
+                "A longer URL could be http://www.sybit.com/solutions/portals.html");
+          
+        assertTrue("Url not found!", links.size() == 3);
+        assertEquals("Wrong URL", "http://www.nutch.org/index.html", links.get(0));
+        assertEquals("Wrong URL", "http://www.google.de", links.get(1));
+        assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", links.get(2));
+    }
+        
+    /** 
+     * Test {@link RegexUtils#extractLinks(String)} for ftp.
+     */
+    public void testExtractLinksFtp() {
+        List<String> links = RegexUtils.extractLinks(
+                "Test with ftp://www.nutch.org is it found? " +
+                "What about www.google.com at ftp://www.google.de");
+         
+        assertTrue("Url not found!", links.size() == 2);
+        assertEquals("Wrong URL", "ftp://www.nutch.org", links.get(0));
+        assertEquals("Wrong URL", "ftp://www.google.de", links.get(1));
+    }
+}