You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/25 04:14:45 UTC

svn commit: r1627446 - in /tika/trunk/tika-example: ./ src/main/java/org/apache/tika/example/ src/test/java/org/apache/tika/example/ src/test/resources/org/apache/tika/example/

Author: tpalsulich
Date: Thu Sep 25 02:14:45 2014
New Revision: 1627446

URL: http://svn.apache.org/r1627446
Log:
TIKA-1420, refactor the phone number extraction to use a custom method of de-obfuscating numbers.

Added:
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
Modified:
    tika/trunk/tika-example/pom.xml
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
    tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
    tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt

Modified: tika/trunk/tika-example/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Thu Sep 25 02:14:45 2014
@@ -66,11 +66,6 @@
             <artifactId>tika-translate</artifactId>
             <version>${project.version}</version>
         </dependency>
-        <dependency>
-            <groupId>com.googlecode.libphonenumber</groupId>
-            <artifactId>libphonenumber</artifactId>
-            <version>6.2.2</version>
-        </dependency>
 
 
         <dependency>

Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java?rev=1627446&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java Thu Sep 25 02:14:45 2014
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.ArrayList;
+
+/**
+ * Class to help de-obfuscate phone numbers in text.
+ */
+public class CleanPhoneText {
+    // Regex to identify a 10-digit phone number in cleaned, digits-only text (the 1st and 4th digits must be 2-9)
+    static final String cleanPhoneRegex = "([2-9]\\d{2}[2-9]\\d{6})";
+
+    // Regex which attempts to ignore punctuation and other distractions.
+    static final String phoneRegex = "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)";
+
+    public static ArrayList<String> extractPhoneNumbers(String text) {
+        text = clean(text);
+        int idx = 0;
+        Pattern p = Pattern.compile(cleanPhoneRegex);
+        Matcher m = p.matcher(text);
+        ArrayList<String> phoneNumbers = new ArrayList<String>();
+        while (m.find(idx)) {
+            String digits = m.group(1);
+            int start = m.start(1);
+            int end = m.end(1);
+            String prefix = "";
+            if (start > 0) {
+                prefix = text.substring(start-1, start);
+            }
+            if (digits.substring(0, 2).equals("82") && prefix.equals("*")) {
+                // this number overlaps with a *82 sequence
+                idx += 2;
+            } else {
+                // seems good
+                phoneNumbers.add(digits);
+                idx = end;
+            }
+        }
+        return phoneNumbers;
+    }
+
+    public static String clean(String text) {
+        text = text.toLowerCase(Locale.ROOT);
+        for (String[][] group : cleanSubstitutions) {
+            for (String[] sub : group) {
+                text = text.replaceAll(sub[0], sub[1]);
+            }
+        }
+        // Delete every character that is not a digit (this includes white space).
+        text = text.replaceAll("[\\D+\\s]", "");
+        return text;
+    }
+
+
+    public static final String[][][] cleanSubstitutions = new String[][][]{
+            {{"&#\\d{1,3};", ""}},         // first simply remove numeric entities
+            {{"th0usand", "thousand"},    // handle common misspellings
+                    {"th1rteen", "thirteen"},
+                    {"f0urteen", "fourteen"},
+                    {"e1ghteen", "eighteen"},
+                    {"n1neteen", "nineteen"},
+                    {"f1fteen", "fifteen"},
+                    {"s1xteen", "sixteen"},
+                    {"th1rty", "thirty"},
+                    {"e1ghty", "eighty"},
+                    {"n1nety", "ninety"},
+                    {"fourty", "forty"},
+                    {"f0urty", "forty"},
+                    {"e1ght", "eight"},
+                    {"f0rty", "forty"},
+                    {"f1fty", "fifty"},
+                    {"s1xty", "sixty"},
+                    {"zer0", "zero"},
+                    {"f0ur", "four"},
+                    {"f1ve", "five"},
+                    {"n1ne", "nine"},
+                    {"0ne", "one"},
+                    {"tw0", "two"},
+                    {"s1x", "six"}},
+            // mixed compound numeral words
+            // consider 7teen, etc.
+            {{"twenty[\\W_]{0,3}1", "twenty-one"},
+                    {"twenty[\\W_]{0,3}2", "twenty-two"},
+                    {"twenty[\\W_]{0,3}3", "twenty-three"},
+                    {"twenty[\\W_]{0,3}4", "twenty-four"},
+                    {"twenty[\\W_]{0,3}5", "twenty-five"},
+                    {"twenty[\\W_]{0,3}6", "twenty-six"},
+                    {"twenty[\\W_]{0,3}7", "twenty-seven"},
+                    {"twenty[\\W_]{0,3}8", "twenty-eight"},
+                    {"twenty[\\W_]{0,3}9", "twenty-nine"},
+                    {"thirty[\\W_]{0,3}1", "thirty-one"},
+                    {"thirty[\\W_]{0,3}2", "thirty-two"},
+                    {"thirty[\\W_]{0,3}3", "thirty-three"},
+                    {"thirty[\\W_]{0,3}4", "thirty-four"},
+                    {"thirty[\\W_]{0,3}5", "thirty-five"},
+                    {"thirty[\\W_]{0,3}6", "thirty-six"},
+                    {"thirty[\\W_]{0,3}7", "thirty-seven"},
+                    {"thirty[\\W_]{0,3}8", "thirty-eight"},
+                    {"thirty[\\W_]{0,3}9", "thirty-nine"},
+                    {"forty[\\W_]{0,3}1", "forty-one"},
+                    {"forty[\\W_]{0,3}2", "forty-two"},
+                    {"forty[\\W_]{0,3}3", "forty-three"},
+                    {"forty[\\W_]{0,3}4", "forty-four"},
+                    {"forty[\\W_]{0,3}5", "forty-five"},
+                    {"forty[\\W_]{0,3}6", "forty-six"},
+                    {"forty[\\W_]{0,3}7", "forty-seven"},
+                    {"forty[\\W_]{0,3}8", "forty-eight"},
+                    {"forty[\\W_]{0,3}9", "forty-nine"},
+                    {"fifty[\\W_]{0,3}1", "fifty-one"},
+                    {"fifty[\\W_]{0,3}2", "fifty-two"},
+                    {"fifty[\\W_]{0,3}3", "fifty-three"},
+                    {"fifty[\\W_]{0,3}4", "fifty-four"},
+                    {"fifty[\\W_]{0,3}5", "fifty-five"},
+                    {"fifty[\\W_]{0,3}6", "fifty-six"},
+                    {"fifty[\\W_]{0,3}7", "fifty-seven"},
+                    {"fifty[\\W_]{0,3}8", "fifty-eight"},
+                    {"fifty[\\W_]{0,3}9", "fifty-nine"},
+                    {"sixty[\\W_]{0,3}1", "sixty-one"},
+                    {"sixty[\\W_]{0,3}2", "sixty-two"},
+                    {"sixty[\\W_]{0,3}3", "sixty-three"},
+                    {"sixty[\\W_]{0,3}4", "sixty-four"},
+                    {"sixty[\\W_]{0,3}5", "sixty-five"},
+                    {"sixty[\\W_]{0,3}6", "sixty-six"},
+                    {"sixty[\\W_]{0,3}7", "sixty-seven"},
+                    {"sixty[\\W_]{0,3}8", "sixty-eight"},
+                    {"sixty[\\W_]{0,3}9", "sixty-nine"},
+                    {"seventy[\\W_]{0,3}1", "seventy-one"},
+                    {"seventy[\\W_]{0,3}2", "seventy-two"},
+                    {"seventy[\\W_]{0,3}3", "seventy-three"},
+                    {"seventy[\\W_]{0,3}4", "seventy-four"},
+                    {"seventy[\\W_]{0,3}5", "seventy-five"},
+                    {"seventy[\\W_]{0,3}6", "seventy-six"},
+                    {"seventy[\\W_]{0,3}7", "seventy-seven"},
+                    {"seventy[\\W_]{0,3}8", "seventy-eight"},
+                    {"seventy[\\W_]{0,3}9", "seventy-nine"},
+                    {"eighty[\\W_]{0,3}1", "eighty-one"},
+                    {"eighty[\\W_]{0,3}2", "eighty-two"},
+                    {"eighty[\\W_]{0,3}3", "eighty-three"},
+                    {"eighty[\\W_]{0,3}4", "eighty-four"},
+                    {"eighty[\\W_]{0,3}5", "eighty-five"},
+                    {"eighty[\\W_]{0,3}6", "eighty-six"},
+                    {"eighty[\\W_]{0,3}7", "eighty-seven"},
+                    {"eighty[\\W_]{0,3}8", "eighty-eight"},
+                    {"eighty[\\W_]{0,3}9", "eighty-nine"},
+                    {"ninety[\\W_]{0,3}1", "ninety-one"},
+                    {"ninety[\\W_]{0,3}2", "ninety-two"},
+                    {"ninety[\\W_]{0,3}3", "ninety-three"},
+                    {"ninety[\\W_]{0,3}4", "ninety-four"},
+                    {"ninety[\\W_]{0,3}5", "ninety-five"},
+                    {"ninety[\\W_]{0,3}6", "ninety-six"},
+                    {"ninety[\\W_]{0,3}7", "ninety-seven"},
+                    {"ninety[\\W_]{0,3}8", "ninety-eight"},
+                    {"ninety[\\W_]{0,3}9", "ninety-nine"}},
+            // now resolve compound numeral words
+            {{"twenty-one", "21"},
+                    {"twenty-two", "22"},
+                    {"twenty-three", "23"},
+                    {"twenty-four", "24"},
+                    {"twenty-five", "25"},
+                    {"twenty-six", "26"},
+                    {"twenty-seven", "27"},
+                    {"twenty-eight", "28"},
+                    {"twenty-nine", "29"},
+                    {"thirty-one", "31"},
+                    {"thirty-two", "32"},
+                    {"thirty-three", "33"},
+                    {"thirty-four", "34"},
+                    {"thirty-five", "35"},
+                    {"thirty-six", "36"},
+                    {"thirty-seven", "37"},
+                    {"thirty-eight", "38"},
+                    {"thirty-nine", "39"},
+                    {"forty-one", "41"},
+                    {"forty-two", "42"},
+                    {"forty-three", "43"},
+                    {"forty-four", "44"},
+                    {"forty-five", "45"},
+                    {"forty-six", "46"},
+                    {"forty-seven", "47"},
+                    {"forty-eight", "48"},
+                    {"forty-nine", "49"},
+                    {"fifty-one", "51"},
+                    {"fifty-two", "52"},
+                    {"fifty-three", "53"},
+                    {"fifty-four", "54"},
+                    {"fifty-five", "55"},
+                    {"fifty-six", "56"},
+                    {"fifty-seven", "57"},
+                    {"fifty-eight", "58"},
+                    {"fifty-nine", "59"},
+                    {"sixty-one", "61"},
+                    {"sixty-two", "62"},
+                    {"sixty-three", "63"},
+                    {"sixty-four", "64"},
+                    {"sixty-five", "65"},
+                    {"sixty-six", "66"},
+                    {"sixty-seven", "67"},
+                    {"sixty-eight", "68"},
+                    {"sixty-nine", "69"},
+                    {"seventy-one", "71"},
+                    {"seventy-two", "72"},
+                    {"seventy-three", "73"},
+                    {"seventy-four", "74"},
+                    {"seventy-five", "75"},
+                    {"seventy-six", "76"},
+                    {"seventy-seven", "77"},
+                    {"seventy-eight", "78"},
+                    {"seventy-nine", "79"},
+                    {"eighty-one", "81"},
+                    {"eighty-two", "82"},
+                    {"eighty-three", "83"},
+                    {"eighty-four", "84"},
+                    {"eighty-five", "85"},
+                    {"eighty-six", "86"},
+                    {"eighty-seven", "87"},
+                    {"eighty-eight", "88"},
+                    {"eighty-nine", "89"},
+                    {"ninety-one", "91"},
+                    {"ninety-two", "92"},
+                    {"ninety-three", "93"},
+                    {"ninety-four", "94"},
+                    {"ninety-five", "95"},
+                    {"ninety-six", "96"},
+                    {"ninety-seven", "97"},
+                    {"ninety-eight", "98"},
+                    {"ninety-nine", "99"}},
+            // larger units function as suffixes now
+            // assume forms like "three hundred four" or "three hundred and four" never occur
+            {{"hundred", "00"},
+                    {"thousand", "000"}},
+            // single numeral words now
+            // some would have been ambiguous
+            {{"seventeen", "17"},
+                    {"thirteen", "13"},
+                    {"fourteen", "14"},
+                    {"eighteen", "18"},
+                    {"nineteen", "19"},
+                    {"fifteen", "15"},
+                    {"sixteen", "16"},
+                    {"seventy", "70"},
+                    {"eleven", "11"},
+                    {"twelve", "12"},
+                    {"twenty", "20"},
+                    {"thirty", "30"},
+                    {"eighty", "80"},
+                    {"ninety", "90"},
+                    {"three", "3"},
+                    {"seven", "7"},
+                    {"eight", "8"},
+                    {"forty", "40"},
+                    {"fifty", "50"},
+                    {"sixty", "60"},
+                    {"zero", "0"},
+                    {"four", "4"},
+                    {"five", "5"},
+                    {"nine", "9"},
+                    {"one", "1"},
+                    {"two", "2"},
+                    {"six", "6"},
+                    {"ten", "10"}},
+            // now do letter for digit substitutions
+            {{"oh", "0"},
+                    {"o", "0"},
+                    {"i", "1"},
+                    {"l", "1"}}
+    };
+}
\ No newline at end of file

Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java Thu Sep 25 02:14:45 2014
@@ -17,33 +17,52 @@
 
 package org.apache.tika.example;
 
-import com.google.i18n.phonenumbers.PhoneNumberMatch;
-import com.google.i18n.phonenumbers.PhoneNumberUtil;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.ContentHandlerDecorator;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
 import java.util.Arrays;
-import java.util.Iterator;
+import java.util.List;
 
+/**
+ * Class used to extract phone numbers while parsing.
+ *
+ * Every time a document is parsed in Tika, the content is split into SAX events.
+ * Those SAX events are handled by a ContentHandler. You can think of these events
+ * as marking a tag in an HTML file. Once you're finished parsing, you can call
+ * handler.toString(), for example, to get the text contents of the file. On the other
+ * hand, any of the metadata of the file will be added to the Metadata object passed
+ * in during the parse() call.  So, the Parser class sends metadata to the Metadata
+ * object and content to the ContentHandler.
+ *
+ * This class is an example of how to combine a ContentHandler and a Metadata.
+ * As content is passed to the handler, we first check to see if it matches a
+ * textual pattern for a phone number. If the extracted content is a phone number,
+ * we add it to the metadata under the key "phonenumbers". So, if you used this
+ * ContentHandler when you parsed a document, then called
+ * metadata.getValues("phonenumbers"), you would get an array of Strings of phone
+ * numbers found in the document.
+ *
+ * Please see the PhoneExtractingContentHandlerTest for an example of how to use
+ * this class.
+ *
+ */
 public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
-    protected final PhoneNumberUtil phoneUtil;
     private Metadata metadata;
     private static final String PHONE_NUMBERS = "phonenumbers";
+    private StringBuilder stringBuilder;
 
     /**
-     * Creates a decorator for the given SAX event handler.
+     * Creates a decorator for the given SAX event handler and Metadata object.
      *
      * @param handler SAX event handler to be decorated
      */
     public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata) {
         super(handler);
-        phoneUtil = PhoneNumberUtil.getInstance();
         this.metadata = metadata;
+        this.stringBuilder = new StringBuilder();
     }
 
     /**
@@ -51,21 +70,41 @@ public class PhoneExtractingContentHandl
      * a dummy content handler that simply ignores all the events. Subclasses
      * should use the {@link #setContentHandler(ContentHandler)} method to
      * switch to a more usable underlying content handler.
+     * Also creates a dummy Metadata object to store phone numbers in.
      */
     protected PhoneExtractingContentHandler() {
         this(new DefaultHandler(), new Metadata());
     }
 
+    /**
+     * The characters method is called whenever a Parser wants to pass raw
+     * characters to the ContentHandler. But, sometimes, phone numbers are split
+     * across different calls to characters, depending on the specific Parser
+     * used. So, we simply add all characters to a StringBuilder and analyze it
+     * once the document is finished.
+     */
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
         try {
             String text = new String(Arrays.copyOfRange(ch, start, start + length));
-            for (PhoneNumberMatch match : phoneUtil.findNumbers(text, "US")) {
-                metadata.add(PHONE_NUMBERS, match.number().toString());
-            }
+            stringBuilder.append(text);
             super.characters(ch, start, length);
         } catch (SAXException e) {
             handleException(e);
         }
     }
+
+
+    /**
+     * This method is called whenever the Parser is done parsing the file. So,
+     * we check the output for any phone numbers.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+        List<String> numbers = CleanPhoneText.extractPhoneNumbers(stringBuilder.toString());
+        for (String number : numbers) {
+            metadata.add(PHONE_NUMBERS, number);
+        }
+    }
 }

Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java Thu Sep 25 02:14:45 2014
@@ -41,7 +41,7 @@ public class PhoneExtractingContentHandl
     public void testExtractPhoneNumbers() throws Exception {
         Parser parser = new AutoDetectParser();
         Metadata metadata = new Metadata();
-        // The PhoneExtractingContentHandler will examine any characters for phone numbers, before passing them
+        // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
         // to the underlying Handler.
         PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
         InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("testPhoneNumberExtractor.odt");
@@ -55,5 +55,9 @@ public class PhoneExtractingContentHandl
         assertContains("9498888888", phoneNumbers[0]);
         assertContains("9497777777", phoneNumbers[1]);
         assertContains("9496666666", phoneNumbers[2]);
+        assertContains("9495555555", phoneNumbers[3]);
+        assertContains("4193404645", phoneNumbers[4]);
+        assertContains("9044687081", phoneNumbers[5]);
+        assertContains("2604094811", phoneNumbers[6]);
     }
 }

Modified: tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
Binary files - no diff available.