You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/30 02:17:59 UTC
svn commit: r1628340 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/sax/
tika-example/src/main/java/org/apache/tika/example/
tika-example/src/test/java/org/apache/tika/example/
tika-example/src/test/resources/org/apache/tika/example/ tika-parsers/src/test/java/org/apache/tika/sax/ tika-parsers/src/test/resources/test-documents/
Author: tpalsulich
Date: Tue Sep 30 00:17:58 2014
New Revision: 1628340
URL: http://svn.apache.org/r1628340
Log:
TIKA-1420, move the PhoneExtractingContentHandler to tika-core. Tests in tika-parsers.
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java
- copied, changed from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java
- copied, changed from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
- copied, changed from r1627446, tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testPhoneNumberExtractor.odt
- copied unchanged from r1627446, tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
Removed:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java (from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java&p1=tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java&r1=1627446&r2=1628340&rev=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java Tue Sep 30 00:17:58 2014
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.tika.example;
+package org.apache.tika.sax;
import java.util.Locale;
import java.util.regex.Matcher;
Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java (from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java&p1=tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java&r1=1627446&r2=1628340&rev=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java Tue Sep 30 00:17:58 2014
@@ -15,9 +15,10 @@
* limitations under the License.
*/
-package org.apache.tika.example;
+package org.apache.tika.sax;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.CleanPhoneText;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java?rev=1628340&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java Tue Sep 30 00:17:58 2014
@@ -0,0 +1,87 @@
+package org.apache.tika.example;
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.PhoneExtractingContentHandler;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.HashSet;
+
+/**
+ * Class to demonstrate how to use the {@link org.apache.tika.sax.PhoneExtractingContentHandler}
+ * to get a list of all of the phone numbers from every file in a directory.
+ *
+ * You can run this main method by running
+ * <code>
+ * mvn exec:java -Dexec.mainClass="org.apache.tika.example.GrabPhoneNumbersExample" -Dexec.args="/path/to/directory"
+ * </code>
+ * from the tika-example directory.
+ */
+public class GrabPhoneNumbersExample {
+    /** Distinct phone numbers collected from every file parsed so far. */
+    private static final HashSet<String> phoneNumbers = new HashSet<String>();
+    /** Number of files for which {@link #process(File)} threw an exception. */
+    private static int failedFiles = 0;
+    /** Number of files for which {@link #process(File)} returned normally. */
+    private static int successfulFiles = 0;
+
+    /**
+     * Walks the directory named by the single command-line argument, then prints
+     * the collected phone numbers and a parsed/total file count.
+     *
+     * @param args exactly one element: the path of the directory to search
+     */
+    public static void main(String[] args) {
+        if (args.length != 1) {
+            System.err.println("Usage: java GrabPhoneNumbersExample [corpus]");
+            return;
+        }
+        final File folder = new File(args[0]);
+        System.out.println("Searching " + folder.getAbsolutePath() + "...");
+        processFolder(folder);
+        System.out.println(phoneNumbers.toString());
+        System.out.println("Parsed " + successfulFiles + "/" + (successfulFiles + failedFiles));
+    }
+
+    /**
+     * Recursively processes every non-directory entry below {@code folder},
+     * updating the success and failure counters as it goes.
+     *
+     * @param folder the directory to walk; if it cannot be listed, a message is
+     *               written to standard error and nothing is processed
+     */
+    public static void processFolder(final File folder) {
+        final File[] entries = folder.listFiles();
+        if (entries == null) {
+            // File.listFiles() returns null (not an empty array) when the path is
+            // not a directory or an I/O error occurs; iterating it would throw a
+            // NullPointerException.
+            System.err.println("Cannot list " + folder.getAbsolutePath() + "; skipping.");
+            return;
+        }
+        for (final File fileEntry : entries) {
+            if (fileEntry.isDirectory()) {
+                processFolder(fileEntry);
+                continue;
+            }
+            try {
+                process(fileEntry);
+                successfulFiles++;
+            } catch (Exception e) {
+                // Best effort: one unparseable file must not abort the whole walk,
+                // but report it so the failure count can be explained.
+                failedFiles++;
+                System.err.println("Failed to parse " + fileEntry.getAbsolutePath() + ": " + e);
+            }
+        }
+    }
+
+    /**
+     * Parses one file and adds every value stored under the
+     * {@code "phonenumbers"} metadata key to the shared result set.
+     *
+     * @param file the file to parse
+     * @throws Exception if the file cannot be opened or the parser fails
+     */
+    public static void process(File file) throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // The PhoneExtractingContentHandler will examine any characters for phone numbers
+        // before passing them to the underlying Handler.
+        PhoneExtractingContentHandler handler =
+                new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
+        InputStream stream = new FileInputStream(file);
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            // Always release the file handle, even when parsing fails.
+            stream.close();
+        }
+        String[] numbers = metadata.getValues("phonenumbers");
+        for (String number : numbers) {
+            phoneNumbers.add(number);
+        }
+    }
+}
Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1628340&r1=1628339&r2=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java Tue Sep 30 00:17:58 2014
@@ -25,10 +25,10 @@ import org.xml.sax.SAXException;
import java.io.IOException;
import java.util.List;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
import static org.apache.tika.TikaTest.assertContains;
import static org.apache.tika.TikaTest.assertNotContained;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
public class ContentHandlerExampleTest {
ContentHandlerExample example;
Copied: tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java (from r1627446, tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java?p2=tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java&p1=tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java&r1=1627446&r2=1628340&rev=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java Tue Sep 30 00:17:58 2014
@@ -15,21 +15,20 @@
* limitations under the License.
*/
-package org.apache.tika.example;
+package org.apache.tika.sax;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import java.io.InputStream;
-import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertTrue;
/**
- * Test class for the {@link org.apache.tika.example.PhoneExtractingContentHandler}
+ * Test class for the {@link org.apache.tika.sax.PhoneExtractingContentHandler}
* class. This demonstrates how to parse a document and retrieve any phone numbers
* found within.
*
@@ -44,7 +43,7 @@ public class PhoneExtractingContentHandl
// The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
// to the underlying Handler.
PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
- InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("testPhoneNumberExtractor.odt");
+ InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt");
try {
parser.parse(stream, handler, metadata, new ParseContext());
}
@@ -60,4 +59,8 @@ public class PhoneExtractingContentHandl
assertContains("9044687081", phoneNumbers[5]);
assertContains("2604094811", phoneNumbers[6]);
}
+
+ private void assertContains(String needle, String haystack) {
+ assertTrue("'" + haystack + "' should contain '" + needle + "'", haystack.contains(needle));
+ }
}