You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/24 21:17:31 UTC
svn commit: r1627397 - in /tika/trunk/tika-example: ./
src/main/java/org/apache/tika/example/
src/test/java/org/apache/tika/example/ src/test/resources/
src/test/resources/org/ src/test/resources/org/apache/
src/test/resources/org/apache/tika/ src/test...
Author: tpalsulich
Date: Wed Sep 24 19:17:31 2014
New Revision: 1627397
URL: http://svn.apache.org/r1627397
Log:
TIKA-1420, create an example of a PhoneNumberContentExtractor.
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
tika/trunk/tika-example/src/test/resources/
tika/trunk/tika-example/src/test/resources/org/
tika/trunk/tika-example/src/test/resources/org/apache/
tika/trunk/tika-example/src/test/resources/org/apache/tika/
tika/trunk/tika-example/src/test/resources/org/apache/tika/example/
tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt (with props)
Modified:
tika/trunk/tika-example/pom.xml
Modified: tika/trunk/tika-example/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1627397&r1=1627396&r2=1627397&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Wed Sep 24 19:17:31 2014
@@ -66,6 +66,12 @@
<artifactId>tika-translate</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>com.googlecode.libphonenumber</groupId>
+ <artifactId>libphonenumber</artifactId>
+ <version>6.2.2</version>
+ </dependency>
+
<dependency>
<groupId>org.apache.tika</groupId>
Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java?rev=1627397&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java Wed Sep 24 19:17:31 2014
@@ -0,0 +1,54 @@
+package org.apache.tika.example;
+
+import com.google.i18n.phonenumbers.PhoneNumberMatch;
+import com.google.i18n.phonenumbers.PhoneNumberUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.Iterator;
+
+public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
+    private static final String PHONE_NUMBERS = "phonenumbers";
+    protected final PhoneNumberUtil phoneUtil;
+    private final Metadata metadata;
+
+    /**
+     * Creates a decorator that records phone numbers seen in character data.
+     *
+     * @param handler SAX event handler to be decorated
+     * @param metadata receives each match as a value of the "phonenumbers" key
+     */
+    public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata) {
+        super(handler);
+        this.phoneUtil = PhoneNumberUtil.getInstance();
+        this.metadata = metadata;
+    }
+
+    /**
+     * Creates a decorator that forwards SAX events to a no-op handler. Subclasses
+     * should call {@link #setContentHandler(ContentHandler)} to install a real one.
+     */
+    protected PhoneExtractingContentHandler() {
+        this(new DefaultHandler(), new Metadata());
+    }
+
+    /** Scans the given characters for "US"-region phone numbers, then forwards them unchanged. */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        try {
+            String text = new String(ch, start, length);
+            for (PhoneNumberMatch match : phoneUtil.findNumbers(text, "US")) {
+                metadata.add(PHONE_NUMBERS, match.number().toString());
+            }
+            super.characters(ch, start, length);
+        } catch (SAXException e) {
+            handleException(e);
+        }
+    }
+}
Added: tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java?rev=1627397&view=auto
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java (added)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java Wed Sep 24 19:17:31 2014
@@ -0,0 +1,42 @@
+package org.apache.tika.example;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+import java.io.InputStream;
+
+import static org.apache.tika.TikaTest.assertContains;
+
+/**
+ * Test class for the {@link org.apache.tika.example.PhoneExtractingContentHandler}
+ * class. This demonstrates how to parse a document and retrieve any phone numbers
+ * found within.
+ *
+ * The phone numbers are added to a multivalued Metadata object under the key, "phonenumbers".
+ * You can get an array of phone numbers by calling metadata.getValues("phonenumbers").
+ */
+public class PhoneExtractingContentHandlerTest {
+    @Test
+    public void testExtractPhoneNumbers() throws Exception {
+        Metadata metadata = new Metadata();
+        // The handler inspects character data for phone numbers before forwarding it to the BodyContentHandler.
+        PhoneExtractingContentHandler handler =
+                new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
+        Parser parser = new AutoDetectParser();
+        InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("testPhoneNumberExtractor.odt");
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+        String[] expected = {"9498888888", "9497777777", "9496666666"};
+        String[] found = metadata.getValues("phonenumbers");
+        for (int i = 0; i < expected.length; i++) {
+            assertContains(expected[i], found[i]);
+        }
+    }
+}
Added: tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt?rev=1627397&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream