You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/25 04:14:45 UTC
svn commit: r1627446 - in /tika/trunk/tika-example: ./
src/main/java/org/apache/tika/example/ src/test/java/org/apache/tika/example/
src/test/resources/org/apache/tika/example/
Author: tpalsulich
Date: Thu Sep 25 02:14:45 2014
New Revision: 1627446
URL: http://svn.apache.org/r1627446
Log:
TIKA-1420, refactor the phone number extraction to use a custom method of de-obfuscating numbers.
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
Modified:
tika/trunk/tika-example/pom.xml
tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
Modified: tika/trunk/tika-example/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Thu Sep 25 02:14:45 2014
@@ -66,11 +66,6 @@
<artifactId>tika-translate</artifactId>
<version>${project.version}</version>
</dependency>
- <dependency>
- <groupId>com.googlecode.libphonenumber</groupId>
- <artifactId>libphonenumber</artifactId>
- <version>6.2.2</version>
- </dependency>
<dependency>
Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java?rev=1627446&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java Thu Sep 25 02:14:45 2014
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.ArrayList;
+
+/**
+ * Class to help de-obfuscate phone numbers in text.
+ */
+public class CleanPhoneText {
+ // Regex to identify a phone number
+ static final String cleanPhoneRegex = "([2-9]\\d{2}[2-9]\\d{6})";
+
+ // Regex which attempts to ignore punctuation and other distractions.
+ static final String phoneRegex = "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)";
+
+ public static ArrayList<String> extractPhoneNumbers(String text) {
+ text = clean(text);
+ int idx = 0;
+ Pattern p = Pattern.compile(cleanPhoneRegex);
+ Matcher m = p.matcher(text);
+ ArrayList<String> phoneNumbers = new ArrayList<String>();
+ while (m.find(idx)) {
+ String digits = m.group(1);
+ int start = m.start(1);
+ int end = m.end(1);
+ String prefix = "";
+ if (start > 0) {
+ prefix = text.substring(start-1, start);
+ }
+ if (digits.substring(0, 2).equals("82") && prefix.equals("*")) {
+ // this number overlaps with a *82 sequence
+ idx += 2;
+ } else {
+ // seems good
+ phoneNumbers.add(digits);
+ idx = end;
+ }
+ }
+ return phoneNumbers;
+ }
+
+ public static String clean(String text) {
+ text = text.toLowerCase(Locale.ROOT);
+ for (String[][] group : cleanSubstitutions) {
+ for (String[] sub : group) {
+ text = text.replaceAll(sub[0], sub[1]);
+ }
+ }
+ // Delete every remaining non-digit character, including white space.
+ text = text.replaceAll("[\\D+\\s]", "");
+ return text;
+ }
+
+
+ public static final String[][][] cleanSubstitutions = new String[][][]{
+ {{"&#\\d{1,3};", ""}}, // first simply remove numeric entities
+ {{"th0usand", "thousand"}, // handle common misspellings
+ {"th1rteen", "thirteen"},
+ {"f0urteen", "fourteen"},
+ {"e1ghteen", "eighteen"},
+ {"n1neteen", "nineteen"},
+ {"f1fteen", "fifteen"},
+ {"s1xteen", "sixteen"},
+ {"th1rty", "thirty"},
+ {"e1ghty", "eighty"},
+ {"n1nety", "ninety"},
+ {"fourty", "forty"},
+ {"f0urty", "forty"},
+ {"e1ght", "eight"},
+ {"f0rty", "forty"},
+ {"f1fty", "fifty"},
+ {"s1xty", "sixty"},
+ {"zer0", "zero"},
+ {"f0ur", "four"},
+ {"f1ve", "five"},
+ {"n1ne", "nine"},
+ {"0ne", "one"},
+ {"tw0", "two"},
+ {"s1x", "six"}},
+ // mixed compound numeral words
+ // consider 7teen, etc.
+ {{"twenty[\\W_]{0,3}1", "twenty-one"},
+ {"twenty[\\W_]{0,3}2", "twenty-two"},
+ {"twenty[\\W_]{0,3}3", "twenty-three"},
+ {"twenty[\\W_]{0,3}4", "twenty-four"},
+ {"twenty[\\W_]{0,3}5", "twenty-five"},
+ {"twenty[\\W_]{0,3}6", "twenty-six"},
+ {"twenty[\\W_]{0,3}7", "twenty-seven"},
+ {"twenty[\\W_]{0,3}8", "twenty-eight"},
+ {"twenty[\\W_]{0,3}9", "twenty-nine"},
+ {"thirty[\\W_]{0,3}1", "thirty-one"},
+ {"thirty[\\W_]{0,3}2", "thirty-two"},
+ {"thirty[\\W_]{0,3}3", "thirty-three"},
+ {"thirty[\\W_]{0,3}4", "thirty-four"},
+ {"thirty[\\W_]{0,3}5", "thirty-five"},
+ {"thirty[\\W_]{0,3}6", "thirty-six"},
+ {"thirty[\\W_]{0,3}7", "thirty-seven"},
+ {"thirty[\\W_]{0,3}8", "thirty-eight"},
+ {"thirty[\\W_]{0,3}9", "thirty-nine"},
+ {"forty[\\W_]{0,3}1", "forty-one"},
+ {"forty[\\W_]{0,3}2", "forty-two"},
+ {"forty[\\W_]{0,3}3", "forty-three"},
+ {"forty[\\W_]{0,3}4", "forty-four"},
+ {"forty[\\W_]{0,3}5", "forty-five"},
+ {"forty[\\W_]{0,3}6", "forty-six"},
+ {"forty[\\W_]{0,3}7", "forty-seven"},
+ {"forty[\\W_]{0,3}8", "forty-eight"},
+ {"forty[\\W_]{0,3}9", "forty-nine"},
+ {"fifty[\\W_]{0,3}1", "fifty-one"},
+ {"fifty[\\W_]{0,3}2", "fifty-two"},
+ {"fifty[\\W_]{0,3}3", "fifty-three"},
+ {"fifty[\\W_]{0,3}4", "fifty-four"},
+ {"fifty[\\W_]{0,3}5", "fifty-five"},
+ {"fifty[\\W_]{0,3}6", "fifty-six"},
+ {"fifty[\\W_]{0,3}7", "fifty-seven"},
+ {"fifty[\\W_]{0,3}8", "fifty-eight"},
+ {"fifty[\\W_]{0,3}9", "fifty-nine"},
+ {"sixty[\\W_]{0,3}1", "sixty-one"},
+ {"sixty[\\W_]{0,3}2", "sixty-two"},
+ {"sixty[\\W_]{0,3}3", "sixty-three"},
+ {"sixty[\\W_]{0,3}4", "sixty-four"},
+ {"sixty[\\W_]{0,3}5", "sixty-five"},
+ {"sixty[\\W_]{0,3}6", "sixty-six"},
+ {"sixty[\\W_]{0,3}7", "sixty-seven"},
+ {"sixty[\\W_]{0,3}8", "sixty-eight"},
+ {"sixty[\\W_]{0,3}9", "sixty-nine"},
+ {"seventy[\\W_]{0,3}1", "seventy-one"},
+ {"seventy[\\W_]{0,3}2", "seventy-two"},
+ {"seventy[\\W_]{0,3}3", "seventy-three"},
+ {"seventy[\\W_]{0,3}4", "seventy-four"},
+ {"seventy[\\W_]{0,3}5", "seventy-five"},
+ {"seventy[\\W_]{0,3}6", "seventy-six"},
+ {"seventy[\\W_]{0,3}7", "seventy-seven"},
+ {"seventy[\\W_]{0,3}8", "seventy-eight"},
+ {"seventy[\\W_]{0,3}9", "seventy-nine"},
+ {"eighty[\\W_]{0,3}1", "eighty-one"},
+ {"eighty[\\W_]{0,3}2", "eighty-two"},
+ {"eighty[\\W_]{0,3}3", "eighty-three"},
+ {"eighty[\\W_]{0,3}4", "eighty-four"},
+ {"eighty[\\W_]{0,3}5", "eighty-five"},
+ {"eighty[\\W_]{0,3}6", "eighty-six"},
+ {"eighty[\\W_]{0,3}7", "eighty-seven"},
+ {"eighty[\\W_]{0,3}8", "eighty-eight"},
+ {"eighty[\\W_]{0,3}9", "eighty-nine"},
+ {"ninety[\\W_]{0,3}1", "ninety-one"},
+ {"ninety[\\W_]{0,3}2", "ninety-two"},
+ {"ninety[\\W_]{0,3}3", "ninety-three"},
+ {"ninety[\\W_]{0,3}4", "ninety-four"},
+ {"ninety[\\W_]{0,3}5", "ninety-five"},
+ {"ninety[\\W_]{0,3}6", "ninety-six"},
+ {"ninety[\\W_]{0,3}7", "ninety-seven"},
+ {"ninety[\\W_]{0,3}8", "ninety-eight"},
+ {"ninety[\\W_]{0,3}9", "ninety-nine"}},
+ // now resolve compound numeral words
+ {{"twenty-one", "21"},
+ {"twenty-two", "22"},
+ {"twenty-three", "23"},
+ {"twenty-four", "24"},
+ {"twenty-five", "25"},
+ {"twenty-six", "26"},
+ {"twenty-seven", "27"},
+ {"twenty-eight", "28"},
+ {"twenty-nine", "29"},
+ {"thirty-one", "31"},
+ {"thirty-two", "32"},
+ {"thirty-three", "33"},
+ {"thirty-four", "34"},
+ {"thirty-five", "35"},
+ {"thirty-six", "36"},
+ {"thirty-seven", "37"},
+ {"thirty-eight", "38"},
+ {"thirty-nine", "39"},
+ {"forty-one", "41"},
+ {"forty-two", "42"},
+ {"forty-three", "43"},
+ {"forty-four", "44"},
+ {"forty-five", "45"},
+ {"forty-six", "46"},
+ {"forty-seven", "47"},
+ {"forty-eight", "48"},
+ {"forty-nine", "49"},
+ {"fifty-one", "51"},
+ {"fifty-two", "52"},
+ {"fifty-three", "53"},
+ {"fifty-four", "54"},
+ {"fifty-five", "55"},
+ {"fifty-six", "56"},
+ {"fifty-seven", "57"},
+ {"fifty-eight", "58"},
+ {"fifty-nine", "59"},
+ {"sixty-one", "61"},
+ {"sixty-two", "62"},
+ {"sixty-three", "63"},
+ {"sixty-four", "64"},
+ {"sixty-five", "65"},
+ {"sixty-six", "66"},
+ {"sixty-seven", "67"},
+ {"sixty-eight", "68"},
+ {"sixty-nine", "69"},
+ {"seventy-one", "71"},
+ {"seventy-two", "72"},
+ {"seventy-three", "73"},
+ {"seventy-four", "74"},
+ {"seventy-five", "75"},
+ {"seventy-six", "76"},
+ {"seventy-seven", "77"},
+ {"seventy-eight", "78"},
+ {"seventy-nine", "79"},
+ {"eighty-one", "81"},
+ {"eighty-two", "82"},
+ {"eighty-three", "83"},
+ {"eighty-four", "84"},
+ {"eighty-five", "85"},
+ {"eighty-six", "86"},
+ {"eighty-seven", "87"},
+ {"eighty-eight", "88"},
+ {"eighty-nine", "89"},
+ {"ninety-one", "91"},
+ {"ninety-two", "92"},
+ {"ninety-three", "93"},
+ {"ninety-four", "94"},
+ {"ninety-five", "95"},
+ {"ninety-six", "96"},
+ {"ninety-seven", "97"},
+ {"ninety-eight", "98"},
+ {"ninety-nine", "99"}},
+ // larger units function as suffixes now
+ // assume we never see forms such as "three hundred four" or "three hundred and four"
+ {{"hundred", "00"},
+ {"thousand", "000"}},
+ // single numeral words now
+ // some would have been ambiguous
+ {{"seventeen", "17"},
+ {"thirteen", "13"},
+ {"fourteen", "14"},
+ {"eighteen", "18"},
+ {"nineteen", "19"},
+ {"fifteen", "15"},
+ {"sixteen", "16"},
+ {"seventy", "70"},
+ {"eleven", "11"},
+ {"twelve", "12"},
+ {"twenty", "20"},
+ {"thirty", "30"},
+ {"eighty", "80"},
+ {"ninety", "90"},
+ {"three", "3"},
+ {"seven", "7"},
+ {"eight", "8"},
+ {"forty", "40"},
+ {"fifty", "50"},
+ {"sixty", "60"},
+ {"zero", "0"},
+ {"four", "4"},
+ {"five", "5"},
+ {"nine", "9"},
+ {"one", "1"},
+ {"two", "2"},
+ {"six", "6"},
+ {"ten", "10"}},
+ // now do letter for digit substitutions
+ {{"oh", "0"},
+ {"o", "0"},
+ {"i", "1"},
+ {"l", "1"}}
+ };
+}
\ No newline at end of file
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java Thu Sep 25 02:14:45 2014
@@ -17,33 +17,52 @@
package org.apache.tika.example;
-import com.google.i18n.phonenumbers.PhoneNumberMatch;
-import com.google.i18n.phonenumbers.PhoneNumberUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
import java.util.Arrays;
-import java.util.Iterator;
+import java.util.List;
+/**
+ * Class used to extract phone numbers while parsing.
+ *
+ * Every time a document is parsed in Tika, the content is split into SAX events.
+ * Those SAX events are handled by a ContentHandler. You can think of these events
+ * as marking a tag in an HTML file. Once you're finished parsing, you can call
+ * handler.toString(), for example, to get the text contents of the file. On the other
+ * hand, any of the metadata of the file will be added to the Metadata object passed
+ * in during the parse() call. So, the Parser class sends metadata to the Metadata
+ * object and content to the ContentHandler.
+ *
+ * This class is an example of how to combine a ContentHandler and a Metadata.
+ * As content is passed to the handler, we first check to see if it matches a
+ * textual pattern for a phone number. If the extracted content is a phone number,
+ * we add it to the metadata under the key "phonenumbers". So, if you used this
+ * ContentHandler when you parsed a document, then called
+ * metadata.getValues("phonenumbers"), you would get an array of Strings of phone
+ * numbers found in the document.
+ *
+ * Please see the PhoneExtractingContentHandlerTest for an example of how to use
+ * this class.
+ *
+ */
public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
- protected final PhoneNumberUtil phoneUtil;
private Metadata metadata;
private static final String PHONE_NUMBERS = "phonenumbers";
+ private StringBuilder stringBuilder;
/**
- * Creates a decorator for the given SAX event handler.
+ * Creates a decorator for the given SAX event handler and Metadata object.
*
* @param handler SAX event handler to be decorated
*/
public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata) {
super(handler);
- phoneUtil = PhoneNumberUtil.getInstance();
this.metadata = metadata;
+ this.stringBuilder = new StringBuilder();
}
/**
@@ -51,21 +70,41 @@ public class PhoneExtractingContentHandl
* a dummy content handler that simply ignores all the events. Subclasses
* should use the {@link #setContentHandler(ContentHandler)} method to
* switch to a more usable underlying content handler.
+ * Also creates a dummy Metadata object to store phone numbers in.
*/
protected PhoneExtractingContentHandler() {
this(new DefaultHandler(), new Metadata());
}
+ /**
+ * The characters method is called whenever a Parser wants to pass raw
+ * characters to the ContentHandler. But, sometimes, phone numbers are split
+ * across different calls to characters, depending on the specific Parser
+ * used. So, we simply add all characters to a StringBuilder and analyze it
+ * once the document is finished.
+ */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
try {
String text = new String(Arrays.copyOfRange(ch, start, start + length));
- for (PhoneNumberMatch match : phoneUtil.findNumbers(text, "US")) {
- metadata.add(PHONE_NUMBERS, match.number().toString());
- }
+ stringBuilder.append(text);
super.characters(ch, start, length);
} catch (SAXException e) {
handleException(e);
}
}
+
+
+ /**
+ * This method is called whenever the Parser is done parsing the file. So,
+ * we check the output for any phone numbers.
+ */
+ @Override
+ public void endDocument() throws SAXException {
+ super.endDocument();
+ List<String> numbers = CleanPhoneText.extractPhoneNumbers(stringBuilder.toString());
+ for (String number : numbers) {
+ metadata.add(PHONE_NUMBERS, number);
+ }
+ }
}
Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java Thu Sep 25 02:14:45 2014
@@ -41,7 +41,7 @@ public class PhoneExtractingContentHandl
public void testExtractPhoneNumbers() throws Exception {
Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
- // The PhoneExtractingContentHandler will examine any characters for phone numbers, before passing them
+ // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
// to the underlying Handler.
PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("testPhoneNumberExtractor.odt");
@@ -55,5 +55,9 @@ public class PhoneExtractingContentHandl
assertContains("9498888888", phoneNumbers[0]);
assertContains("9497777777", phoneNumbers[1]);
assertContains("9496666666", phoneNumbers[2]);
+ assertContains("9495555555", phoneNumbers[3]);
+ assertContains("4193404645", phoneNumbers[4]);
+ assertContains("9044687081", phoneNumbers[5]);
+ assertContains("2604094811", phoneNumbers[6]);
}
}
Modified: tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt?rev=1627446&r1=1627445&r2=1627446&view=diff
==============================================================================
Binary files - no diff available.