You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2015/11/04 05:04:05 UTC

svn commit: r1712462 - in /tika/trunk: tika-bundle/src/test/java/org/apache/tika/bundle/ tika-bundle/src/test/resources/ tika-core/src/main/resources/org/apache/tika/parser/ tika-core/src/main/resources/org/apache/tika/parser/external/ tika-parsers/src...

Author: bob
Date: Wed Nov  4 04:04:05 2015
New Revision: 1712462

URL: http://svn.apache.org/viewvc?rev=1712462&view=rev
Log:
TIKA-1507 - Moved tika-external-parsers.xml to tika-core to prevent an OSGi split-package issue.

Added:
    tika/trunk/tika-bundle/src/test/resources/testOCR.jpg   (with props)
    tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/
    tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/
    tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
Removed:
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/external/
Modified:
    tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

Modified: tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java?rev=1712462&r1=1712461&r2=1712462&view=diff
==============================================================================
--- tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java (original)
+++ tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java Wed Nov  4 04:04:05 2015
@@ -51,6 +51,7 @@ import org.apache.tika.parser.DefaultPar
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.internal.Activator;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -234,6 +235,18 @@ public class BundleIT {
         }
         assertEquals(rawParsers, osgiParsers);
     }
+    
+    @Test
+    public void testTesseractParser() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+        Parser tesseractParser = new TesseractOCRParser();
+        try(InputStream stream = new FileInputStream("src/test/resources/testOCR.jpg"))
+        {
+            tesseractParser.parse(stream, handler, new Metadata(), context);
+        }
+        
+    }
 
 
     @Test

Added: tika/trunk/tika-bundle/src/test/resources/testOCR.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/src/test/resources/testOCR.jpg?rev=1712462&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-bundle/src/test/resources/testOCR.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml?rev=1712462&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml (added)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml Wed Nov  4 04:04:05 2015
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!--
+  Description: This xml file defines external commands to be run by Tika
+  as parsers.
+-->
+<external-parsers>
+  <!-- This example uses ffmpeg for video metadata extraction -->
+  <parser>
+     <check>
+       <command>ffmpeg -version</command>
+       <error-codes>126,127</error-codes>
+     </check>
+     <command>ffmpeg -i ${INPUT}</command>
+     <mime-types>
+       <mime-type>video/avi</mime-type>
+       <mime-type>video/mpeg</mime-type>
+       <mime-type>video/x-msvideo</mime-type>
+     </mime-types>
+     <metadata>
+       <match key="xmpDM:audioSampleRate">\s*Stream.*:.+Audio:.*,\s+(\d+)\s+Hz,.*</match>
+       <match key="xmpDM:audioChannelType">\s*Stream.*:.+Audio:.*\d+\s+Hz,\s+(\d{1,2})\s+channels.*</match>
+       <match key="xmpDM:audioCompressor">\s*Stream.*:.+Audio:\s+([A-Za-z0-9_\(\)/\[\] ]+),.*</match>
+       <match key="xmpDM:duration">\s*Duration:\s*([0-9:\.]+),.*</match>
+       <match key="xmpDM:fileDataRate">\s*Duration:.*,\s*bitrate:\s+([0-9A-Za-z/ ]+).*</match>
+       <match key="xmpDM:videoColorSpace">\s*Stream.*:\s+Video:\s+[A-Za-z0-9\(\)/ ]+,\s+([A-Za-z0-9\(\) ,]+),\s+[0-9x]+,.*</match>
+       <match key="xmpDM:videoCompressor">\s*Stream.*:\s+Video:\s+([A-Za-z0-9\(\)/ ]+),.*</match>
+       <match key="xmpDM:videoFrameRate">\s*Stream.*:\s+Video:.*,\s+([0-9]+)\s+fps,.*</match>
+       <match key="encoder">\s*encoder\s*\:\s*(\w+).*</match>
+       <match key="videoResolution">\s*Stream.*:\s+Video:.*,\s+([0-9x]+),.*</match>
+     </metadata>
+  </parser>
+  <parser>
+     <check>
+       <command>exiftool -ver</command>
+       <error-codes>126,127</error-codes>
+     </check>
+     <command>env FOO=${OUTPUT} exiftool ${INPUT}</command>
+     <mime-types>
+       <mime-type>video/avi</mime-type>
+       <mime-type>video/mpeg</mime-type>
+       <mime-type>video/x-msvideo</mime-type>
+       <mime-type>video/mp4</mime-type>
+     </mime-types>
+     <metadata>
+       <match>\s*([A-Za-z0-9/ \(\)]+\S{1})\s+:\s+([A-Za-z0-9\(\)\[\] \:\-\.]+)\s*</match>
+     </metadata>
+  </parser>
+</external-parsers>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1712462&r1=1712461&r2=1712462&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Nov  4 04:04:05 2015
@@ -121,17 +121,10 @@ public class TesseractOCRParser extends
 
         // Try running Tesseract from there, and see if it exists + works
         String[] checkCmd = { tesseract };
-        try {
-            boolean hasTesseract = ExternalParser.check(checkCmd);
-            TESSERACT_PRESENT.put(tesseract, hasTesseract);
-            return hasTesseract;
-        } catch (NoClassDefFoundError e) {
-            // This happens under OSGi + Fork Parser - see TIKA-1507
-            // As a workaround for now, just say we can't use OCR
-            // TODO Resolve it so we don't need this try/catch block
-            TESSERACT_PRESENT.put(tesseract, false);
-            return false;
-        }
+        boolean hasTesseract = ExternalParser.check(checkCmd);
+        TESSERACT_PRESENT.put(tesseract, hasTesseract);
+        return hasTesseract;
+     
     }
 
     public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,