You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/12/21 10:12:21 UTC
svn commit: r606140 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/utils/
src/test/java/org/apache/tika/utils/
Author: jukka
Date: Fri Dec 21 01:12:20 2007
New Revision: 606140
URL: http://svn.apache.org/viewvc?rev=606140&view=rev
Log:
TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex
- Patch from Niall Pemberton
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/utils/
incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/pom.xml
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Dec 21 01:12:20 2007
@@ -137,3 +137,6 @@
62. TIKA-104 - Add utility methods to throw IOException with the cause
initialized (jukka & Niall Pemberton)
+
+63. TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex
+ (Niall Pemberton)
Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Dec 21 01:12:20 2007
@@ -195,11 +195,6 @@
<version>1.1.1</version>
</dependency>
<dependency>
- <groupId>oro</groupId>
- <artifactId>oro</artifactId>
- <version>2.0.8</version>
- </dependency>
- <dependency>
<groupId>nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>0.9.5</version>
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Fri Dec 21 01:12:20 2007
@@ -20,7 +20,6 @@
import java.io.InputStream;
import java.io.StringWriter;
-import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.TeeContentHandler;
@@ -38,11 +37,6 @@
*/
public class ParserPostProcessor extends ParserDecorator {
- private static final String LINK_PATTERN =
- "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
- + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
- + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
-
/**
* Creates a post-processing decorator for the given parser.
*
@@ -70,12 +64,8 @@
int length = Math.min(content.length(), 500);
metadata.set("summary", content.substring(0, length));
- try {
- for (String link : RegexUtils.extract(content, LINK_PATTERN)) {
- metadata.add("outlinks", link);
- }
- } catch (MalformedPatternException e) {
- throw new TikaException("Malformed URL pattern", e);
+ for (String link : RegexUtils.extractLinks(content)) {
+ metadata.add("outlinks", link);
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java?rev=606140&r1=606139&r2=606140&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java Fri Dec 21 01:12:20 2007
@@ -17,17 +17,10 @@
package org.apache.tika.utils;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
-
-import org.apache.log4j.Logger;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
/**
* Inspired from Nutch code class OutlinkExtractor. Apply regex to extract
@@ -37,32 +30,37 @@
*/
public class RegexUtils {
- static Logger logger = Logger.getRootLogger();
-
- public static List<String> extract(String content, String regex)
- throws MalformedPatternException {
+ /**
+ * Regex pattern to match URLs within plain text.
+ *
+ * @see <a
+ * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+ * </a>
+ */
+ private static final String LINKS_REGEX =
+ "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
+ + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
+ + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+
+ private static final Pattern LINKS_PATTERN = Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE);
+
+ /**
+ * Extract urls from plain text.
+ *
+ * @param content The plain text content to examine
+ * @return List of urls found within the plain text
+ */
+ public static List<String> extractLinks(String content) {
+ if (content == null || content.length() == 0) {
+ return Collections.emptyList();
+ }
List<String> extractions = new ArrayList<String>();
- final PatternCompiler cp = new Perl5Compiler();
- final Pattern pattern = cp.compile(regex,
- Perl5Compiler.CASE_INSENSITIVE_MASK
- | Perl5Compiler.READ_ONLY_MASK
- | Perl5Compiler.MULTILINE_MASK);
- final PatternMatcher matcher = new Perl5Matcher();
-
- final PatternMatcherInput input = new PatternMatcherInput(content);
-
- MatchResult result;
- String extractedContent;
-
- while (matcher.contains(input, pattern)) {
- result = matcher.getMatch();
- extractedContent = result.group(0);
- extractions.add(extractedContent);
+ final Matcher matcher = LINKS_PATTERN.matcher(content);
+ while (matcher.find()) {
+ extractions.add(matcher.group());
}
-
return extractions;
}
-
}
Added: incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java?rev=606140&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java Fri Dec 21 01:12:20 2007
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.util.List;
+import junit.framework.TestCase;
+
+/**
+ * Test case for {@link RegexUtils}.
+ *
+ * @version $Revision$ $Date$
+ */
+public class RegexUtilsTest extends TestCase {
+
+ /**
+ * Test {@link RegexUtils#extractLinks(String)} with no links.
+ */
+
+ public void testExtractLinksNone() {
+ List<String> links = null;
+
+ links = RegexUtils.extractLinks(null);
+ assertNotNull(links);
+ assertEquals(0, links.size());
+
+ links = RegexUtils.extractLinks("");
+ assertNotNull(links);
+ assertEquals(0, links.size());
+
+ links = RegexUtils.extractLinks(
+ "Test with no links " +
+ "What about www.google.com");
+ assertNotNull(links);
+ assertEquals(0, links.size());
+ }
+
+
+ /**
+ * Test {@link RegexUtils#extractLinks(String)} for http.
+ */
+ public void testExtractLinksHttp() {
+ List<String> links = RegexUtils.extractLinks(
+ "Test with http://www.nutch.org/index.html is it found? " +
+ "What about www.google.com at http://www.google.de " +
+ "A longer URL could be http://www.sybit.com/solutions/portals.html");
+
+ assertTrue("Url not found!", links.size() == 3);
+ assertEquals("Wrong URL", "http://www.nutch.org/index.html", links.get(0));
+ assertEquals("Wrong URL", "http://www.google.de", links.get(1));
+ assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", links.get(2));
+ }
+
+ /**
+ * Test {@link RegexUtils#extractLinks(String)} for ftp.
+ */
+ public void testExtractLinksFtp() {
+ List<String> links = RegexUtils.extractLinks(
+ "Test with ftp://www.nutch.org is it found? " +
+ "What about www.google.com at ftp://www.google.de");
+
+ assertTrue("Url not found!", links.size() == 2);
+ assertEquals("Wrong URL", "ftp://www.nutch.org", links.get(0));
+ assertEquals("Wrong URL", "ftp://www.google.de", links.get(1));
+ }
+}