You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2015/11/04 05:04:05 UTC
svn commit: r1712462 - in /tika/trunk:
tika-bundle/src/test/java/org/apache/tika/bundle/
tika-bundle/src/test/resources/
tika-core/src/main/resources/org/apache/tika/parser/
tika-core/src/main/resources/org/apache/tika/parser/external/
tika-parsers/src...
Author: bob
Date: Wed Nov 4 04:04:05 2015
New Revision: 1712462
URL: http://svn.apache.org/viewvc?rev=1712462&view=rev
Log:
TIKA-1507 - Moved tika-external-parsers.xml to tika-core to prevent an OSGi split-package issue.
Added:
tika/trunk/tika-bundle/src/test/resources/testOCR.jpg (with props)
tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/
tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/
tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
Removed:
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/external/
Modified:
tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Modified: tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java?rev=1712462&r1=1712461&r2=1712462&view=diff
==============================================================================
--- tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java (original)
+++ tika/trunk/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java Wed Nov 4 04:04:05 2015
@@ -51,6 +51,7 @@ import org.apache.tika.parser.DefaultPar
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.internal.Activator;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -234,6 +235,18 @@ public class BundleIT {
}
assertEquals(rawParsers, osgiParsers);
}
+
+ @Test
+ public void testTesseractParser() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ Parser tesseractParser = new TesseractOCRParser();
+ try(InputStream stream = new FileInputStream("src/test/resources/testOCR.jpg"))
+ {
+ tesseractParser.parse(stream, handler, new Metadata(), context);
+ }
+
+ }
@Test
Added: tika/trunk/tika-bundle/src/test/resources/testOCR.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/src/test/resources/testOCR.jpg?rev=1712462&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-bundle/src/test/resources/testOCR.jpg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml?rev=1712462&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml (added)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml Wed Nov 4 04:04:05 2015
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Description: This XML file defines external commands that Tika runs
+ as parsers.
+-->
+<external-parsers>
+ <!-- This example uses ffmpeg for video metadata extraction -->
+ <parser>
+ <check>
+ <command>ffmpeg -version</command>
+ <error-codes>126,127</error-codes>
+ </check>
+ <command>ffmpeg -i ${INPUT}</command>
+ <mime-types>
+ <mime-type>video/avi</mime-type>
+ <mime-type>video/mpeg</mime-type>
+ <mime-type>video/x-msvideo</mime-type>
+ </mime-types>
+ <metadata>
+ <match key="xmpDM:audioSampleRate">\s*Stream.*:.+Audio:.*,\s+(\d+)\s+Hz,.*</match>
+ <match key="xmpDM:audioChannelType">\s*Stream.*:.+Audio:.*\d+\s+Hz,\s+(\d{1,2})\s+channels.*</match>
+ <match key="xmpDM:audioCompressor">\s*Stream.*:.+Audio:\s+([A-Za-z0-9_\(\)/\[\] ]+),.*</match>
+ <match key="xmpDM:duration">\s*Duration:\s*([0-9:\.]+),.*</match>
+ <match key="xmpDM:fileDataRate">\s*Duration:.*,\s*bitrate:\s+([0-9A-Za-z/ ]+).*</match>
+ <match key="xmpDM:videoColorSpace">\s*Stream.*:\s+Video:\s+[A-Za-z0-9\(\)/ ]+,\s+([A-Za-z0-9\(\) ,]+),\s+[0-9x]+,.*</match>
+ <match key="xmpDM:videoCompressor">\s*Stream.*:\s+Video:\s+([A-Za-z0-9\(\)/ ]+),.*</match>
+ <match key="xmpDM:videoFrameRate">\s*Stream.*:\s+Video:.*,\s+([0-9]+)\s+fps,.*</match>
+ <match key="encoder">\s*encoder\s*\:\s*(\w+).*</match>
+ <match key="videoResolution">\s*Stream.*:\s+Video:.*,\s+([0-9x]+),.*</match>
+ </metadata>
+ </parser>
+ <parser>
+ <check>
+ <command>exiftool -ver</command>
+ <error-codes>126,127</error-codes>
+ </check>
+ <command>env FOO=${OUTPUT} exiftool ${INPUT}</command>
+ <mime-types>
+ <mime-type>video/avi</mime-type>
+ <mime-type>video/mpeg</mime-type>
+ <mime-type>video/x-msvideo</mime-type>
+ <mime-type>video/mp4</mime-type>
+ </mime-types>
+ <metadata>
+ <match>\s*([A-Za-z0-9/ \(\)]+\S{1})\s+:\s+([A-Za-z0-9\(\)\[\] \:\-\.]+)\s*</match>
+ </metadata>
+ </parser>
+</external-parsers>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1712462&r1=1712461&r2=1712462&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Nov 4 04:04:05 2015
@@ -121,17 +121,10 @@ public class TesseractOCRParser extends
// Try running Tesseract from there, and see if it exists + works
String[] checkCmd = { tesseract };
- try {
- boolean hasTesseract = ExternalParser.check(checkCmd);
- TESSERACT_PRESENT.put(tesseract, hasTesseract);
- return hasTesseract;
- } catch (NoClassDefFoundError e) {
- // This happens under OSGi + Fork Parser - see TIKA-1507
- // As a workaround for now, just say we can't use OCR
- // TODO Resolve it so we don't need this try/catch block
- TESSERACT_PRESENT.put(tesseract, false);
- return false;
- }
+ boolean hasTesseract = ExternalParser.check(checkCmd);
+ TESSERACT_PRESENT.put(tesseract, hasTesseract);
+ return hasTesseract;
+
}
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,