You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/30 02:17:59 UTC

svn commit: r1628340 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/sax/ tika-example/src/main/java/org/apache/tika/example/ tika-example/src/test/java/org/apache/tika/example/ tika-example/src/test/resources/org/apache/tika/example/ tika-parsers/src/test/java/org/apache/tika/sax/ tika-parsers/src/test/resources/test-documents/

Author: tpalsulich
Date: Tue Sep 30 00:17:58 2014
New Revision: 1628340

URL: http://svn.apache.org/r1628340
Log:
TIKA-1420, move the PhoneExtractingContentHandler to tika-core. Tests in tika-parsers.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java
      - copied, changed from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java
      - copied, changed from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
      - copied, changed from r1627446, tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPhoneNumberExtractor.odt
      - copied unchanged from r1627446, tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
Removed:
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java
    tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java
    tika/trunk/tika-example/src/test/resources/org/apache/tika/example/testPhoneNumberExtractor.odt
Modified:
    tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java

Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java (from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java&p1=tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java&r1=1627446&r2=1628340&rev=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/CleanPhoneText.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java Tue Sep 30 00:17:58 2014
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.tika.example;
+package org.apache.tika.sax;
 
 import java.util.Locale;
 import java.util.regex.Matcher;

Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java (from r1627446, tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java&p1=tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java&r1=1627446&r2=1628340&rev=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PhoneExtractingContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java Tue Sep 30 00:17:58 2014
@@ -15,9 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.tika.example;
+package org.apache.tika.sax;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.CleanPhoneText;
 import org.apache.tika.sax.ContentHandlerDecorator;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;

Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java?rev=1628340&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java Tue Sep 30 00:17:58 2014
@@ -0,0 +1,118 @@
+package org.apache.tika.example;
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.PhoneExtractingContentHandler;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.HashSet;
+
+/**
+ * Class to demonstrate how to use the {@link org.apache.tika.sax.PhoneExtractingContentHandler}
+ * to get a list of all of the phone numbers from every file in a directory.
+ *
+ * You can run this main method by running
+ * <code>
+ *     mvn exec:java -Dexec.mainClass="org.apache.tika.example.GrabPhoneNumbersExample" -Dexec.args="/path/to/directory"
+ * </code>
+ * from the tika-example directory.
+ */
+public class GrabPhoneNumbersExample {
+    /** Distinct phone numbers collected from every file parsed so far. */
+    private static final HashSet<String> phoneNumbers = new HashSet<String>();
+    /** Number of files whose parse threw an exception. */
+    private static int failedFiles = 0;
+    /** Number of files parsed without an exception. */
+    private static int successfulFiles = 0;
+
+    /**
+     * Walks the directory named by the single command-line argument, then prints the
+     * collected phone numbers followed by a parsed/total file count.
+     *
+     * @param args exactly one element: the path of the directory to search
+     */
+    public static void main(String[] args) {
+        if (args.length != 1) {
+            System.err.println("Usage: java GrabPhoneNumbersExample [corpus]");
+            return;
+        }
+        final File folder = new File(args[0]);
+        System.out.println("Searching " + folder.getAbsolutePath() + "...");
+        processFolder(folder);
+        System.out.println(phoneNumbers.toString());
+        System.out.println("Parsed " + successfulFiles + "/" + (successfulFiles + failedFiles));
+    }
+
+    /**
+     * Recursively parses every file beneath {@code folder}, counting successes and failures.
+     * A file that fails to parse is reported on standard error and skipped.
+     *
+     * @param folder the directory to walk; if it cannot be listed (for example because it is
+     *               not a directory) a message is printed to standard error and nothing is parsed
+     */
+    public static void processFolder(final File folder) {
+        final File[] entries = folder.listFiles();
+        if (entries == null) {
+            // File.listFiles() returns null rather than an empty array when the path is not a
+            // directory or an I/O error occurs; iterating it directly would throw a NullPointerException.
+            System.err.println("Cannot list " + folder.getAbsolutePath() + "; skipping.");
+            return;
+        }
+        for (final File fileEntry : entries) {
+            if (fileEntry.isDirectory()) {
+                processFolder(fileEntry);
+            } else {
+                try {
+                    process(fileEntry);
+                    successfulFiles++;
+                } catch (Exception e) {
+                    // Best effort: one unparseable file must not abort the whole walk, but say which one.
+                    failedFiles++;
+                    System.err.println("Failed to parse " + fileEntry.getAbsolutePath() + ": " + e);
+                }
+            }
+        }
+    }
+
+    /**
+     * Parses one file and adds any phone numbers found in it to the shared set.
+     *
+     * @param file the file to parse
+     * @throws Exception if the file cannot be opened or the parser fails
+     */
+    public static void process(File file) throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
+        // to the underlying Handler.
+        PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
+        InputStream stream = new FileInputStream(file);
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+        // The numbers are read back from the metadata under the "phonenumbers" key.
+        String[] numbers = metadata.getValues("phonenumbers");
+        for (String number : numbers) {
+            phoneNumbers.add(number);
+        }
+    }
+}

Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1628340&r1=1628339&r2=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java Tue Sep 30 00:17:58 2014
@@ -25,10 +25,10 @@ import org.xml.sax.SAXException;
 import java.io.IOException;
 import java.util.List;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
 import static org.apache.tika.TikaTest.assertContains;
 import static org.apache.tika.TikaTest.assertNotContained;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class ContentHandlerExampleTest {
     ContentHandlerExample example;

Copied: tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java (from r1627446, tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java?p2=tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java&p1=tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java&r1=1627446&r2=1628340&rev=1628340&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/PhoneExtractingContentHandlerTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java Tue Sep 30 00:17:58 2014
@@ -15,21 +15,20 @@
  * limitations under the License.
  */
 
-package org.apache.tika.example;
+package org.apache.tika.sax;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 
 import java.io.InputStream;
 
-import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertTrue;
 
 /**
- * Test class for the {@link org.apache.tika.example.PhoneExtractingContentHandler}
+ * Test class for the {@link org.apache.tika.sax.PhoneExtractingContentHandler}
  * class. This demonstrates how to parse a document and retrieve any phone numbers
  * found within.
  *
@@ -44,7 +43,7 @@ public class PhoneExtractingContentHandl
         // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
         // to the underlying Handler.
         PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
-        InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("testPhoneNumberExtractor.odt");
+        InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt");
         try {
             parser.parse(stream, handler, metadata, new ParseContext());
         }
@@ -60,4 +59,8 @@ public class PhoneExtractingContentHandl
         assertContains("9044687081", phoneNumbers[5]);
         assertContains("2604094811", phoneNumbers[6]);
     }
+
+    private void assertContains(String needle, String haystack) {
+        assertTrue("'" + haystack + "' should contain '" + needle + "'", haystack.contains(needle));
+    }
 }