You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/24 21:17:31 UTC

svn commit: r1627397 - in /tika/trunk/tika-example: ./ src/main/java/org/apache/tika/example/ src/test/java/org/apache/tika/example/ src/test/resources/ src/test/resources/org/ src/test/resources/org/apache/ src/test/resources/org/apache/tika/ src/test...

Author: tpalsulich
Date: Wed Sep 24 19:17:31 2014
New Revision: 1627397

URL: http://svn.apache.org/r1627397
Log:
TIKA-1420, create an example of a PhoneNumberContentExtractor.

Added:
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
    tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
    tika/trunk/tika-example/src/test/resources/
    tika/trunk/tika-example/src/test/resources/org/
    tika/trunk/tika-example/src/test/resources/org/apache/
    tika/trunk/tika-example/src/test/resources/org/apache/tika/
    tika/trunk/tika-example/src/test/resources/org/apache/tika/example/
    tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt   (with props)
Modified:
    tika/trunk/tika-example/pom.xml

Modified: tika/trunk/tika-example/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1627397&r1=1627396&r2=1627397&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Wed Sep 24 19:17:31 2014
@@ -66,6 +66,12 @@
             <artifactId>tika-translate</artifactId>
             <version>${project.version}</version>
         </dependency>
+        <dependency>
+            <groupId>com.googlecode.libphonenumber</groupId>
+            <artifactId>libphonenumber</artifactId>
+            <version>6.2.2</version>
+        </dependency>
+
 
         <dependency>
             <groupId>org.apache.tika</groupId>

Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java?rev=1627397&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java Wed Sep 24 19:17:31 2014
@@ -0,0 +1,54 @@
+package org.apache.tika.example;
+
+import com.google.i18n.phonenumbers.PhoneNumberMatch;
+import com.google.i18n.phonenumbers.PhoneNumberUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.Iterator;
+
+/** Decorator that adds phone numbers found in character data to a {@link Metadata} object. */
+public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
+    private static final String PHONE_NUMBERS = "phonenumbers";
+    protected final PhoneNumberUtil phoneUtil;
+    private final Metadata metadata;
+
+    /**
+     * Creates a decorator for the given SAX event handler.
+     * @param handler SAX event handler to be decorated
+     * @param metadata receives every phone number found, under the "phonenumbers" key
+     */
+    public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata) {
+        super(handler);
+        phoneUtil = PhoneNumberUtil.getInstance();
+        this.metadata = metadata;
+    }
+
+    /**
+     * Creates a decorator over a no-op {@link DefaultHandler} and a new {@link Metadata};
+     * subclasses should install a real handler via {@link #setContentHandler(ContentHandler)}.
+     */
+    protected PhoneExtractingContentHandler() {
+        this(new DefaultHandler(), new Metadata());
+    }
+
+    /** Records phone numbers found in this chunk (region "US"), then forwards the event. */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        try {
+            String text = new String(ch, start, length);
+            for (PhoneNumberMatch match : phoneUtil.findNumbers(text, "US")) {
+                metadata.add(PHONE_NUMBERS, match.number().toString());
+            }
+            super.characters(ch, start, length);
+        } catch (SAXException e) {
+            handleException(e);
+        }
+    }
+}

Added: tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java?rev=1627397&view=auto
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java (added)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java Wed Sep 24 19:17:31 2014
@@ -0,0 +1,42 @@
+package org.apache.tika.example;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+import java.io.InputStream;
+
+import static org.apache.tika.TikaTest.assertContains;
+
+/**
+ * Test class for the {@link org.apache.tika.example.PhoneExtractingContentHandler}
+ * class. This demonstrates how to parse a document and retrieve any phone numbers
+ * found within.
+ *
+ * <p>The phone numbers are added to a multivalued Metadata object under the key, "phonenumbers".
+ * You can get an array of phone numbers by calling metadata.getValues("phonenumbers").
+ */
+public class PhoneExtractingContentHandlerTest {
+    @Test
+    public void testExtractPhoneNumbers() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // The PhoneExtractingContentHandler will examine any characters for phone numbers, before passing them
+        // to the underlying Handler.
+        PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
+        InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("testPhoneNumberExtractor.odt");
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+        // Stored values are PhoneNumber.toString() output, so only check that the digits occur, in order found.
+        String[] phoneNumbers = metadata.getValues("phonenumbers");
+        assertContains("9498888888", phoneNumbers[0]);
+        assertContains("9497777777", phoneNumbers[1]);
+        assertContains("9496666666", phoneNumbers[2]);
+    }
+}

Added: tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt?rev=1627397&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream