You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/09/19 16:16:29 UTC

svn commit: r1626226 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/resources/test-documents/

Author: tpalsulich
Date: Fri Sep 19 14:16:29 2014
New Revision: 1626226

URL: http://svn.apache.org/r1626226
Log:
TIKA-93, create a new Tesseract OCR Parser.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.docx   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pdf   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pptx   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1626226&r1=1626225&r2=1626226&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Sep 19 14:16:29 2014
@@ -3,6 +3,9 @@ Release 1.7 - Current Development
   * PackageParser includes the last-modified date from the archive
     in the metadata, when handling embedded entries (TIKA-1246)
 
+  * Created a new Tesseract OCR Parser to extract text from images.
+    Requires installation of Tesseract before use (TIKA-93).
+
 
 Release 1.6 - 08/31/2014
 

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1626226&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java Fri Sep 19 14:16:29 2014
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import java.io.File;
+import java.io.Serializable;
+
+/**
+ * Configuration for TesseractOCRParser.
+ * 
+ * Use this class to enable TesseractOCRParser and set its parameters:
+ * <p>
+ * TesseractOCRConfig config = new TesseractOCRConfig();<br>
+ * config.setTesseractPath(tesseractFolder);<br>
+ * parseContext.set(TesseractOCRConfig.class, config);<br>
+ * </p>
+ * 
+ * 
+ */
+public class TesseractOCRConfig implements Serializable{
+
+	private static final long serialVersionUID = -4861942486845757891L;
+	
+	// Path to tesseract installation folder, if not on system path.
+	private  String tesseractPath = "";
+	
+	// Language dictionary to be used.
+	private  String language = "eng";
+	
+	// Tesseract page segmentation mode.
+	private  String pageSegMode = "1";
+	
+	// Minimum file size to submit file to ocr.
+	private  int minFileSizeToOcr = 0;
+	
+	// Maximum file size to submit file to ocr.
+	private  int maxFileSizeToOcr = Integer.MAX_VALUE;
+	
+	// Maximum time (seconds) to wait for the ocring process termination
+	private int timeout = 120;
+	
+	/** @see #setTesseractPath(String tesseractPath)*/
+	public String getTesseractPath() {
+		return tesseractPath;
+	}
+	
+	/**
+	 * Set tesseract installation folder, needed if it is not on system path.
+	 */
+	public void setTesseractPath(String tesseractPath) {
+		if(!tesseractPath.endsWith(File.separator))
+			tesseractPath += File.separator;
+		
+		this.tesseractPath = tesseractPath;
+	}
+	
+	/** @see #setLanguage(String language)*/
+	public String getLanguage() {
+		return language;
+	}
+	
+	/**
+	 * Set tesseract language dictionary to be used. Default is "eng".
+	 * Multiple languages may be specified, separated by plus characters.
+	 */
+	public void setLanguage(String language) {
+		this.language = language;
+	}
+	
+	/** @see #setPageSegMode(String pageSegMode)*/
+	public String getPageSegMode() {
+		return pageSegMode;
+	}
+	
+	/**
+	 * Set tesseract page segmentation mode.
+	 * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+	 */
+	public void setPageSegMode(String pageSegMode) {
+		this.pageSegMode = pageSegMode;
+	}
+	
+	/** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
+	public int getMinFileSizeToOcr() {
+		return minFileSizeToOcr;
+	}
+	
+	/**
+	 * Set minimum file size to submit file to ocr.
+	 * Default is 0.
+	 */
+	public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+		this.minFileSizeToOcr = minFileSizeToOcr;
+	}
+	
+	/** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
+	public int getMaxFileSizeToOcr() {
+		return maxFileSizeToOcr;
+	}
+	
+	/**
+	 * Set maximum file size to submit file to ocr.
+	 * Default is Integer.MAX_VALUE.
+	 */
+	public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+		this.maxFileSizeToOcr = maxFileSizeToOcr;
+	}
+
+	/**
+	 * Set maximum time (seconds) to wait for the ocring process to terminate.
+	 * Default value is 120s.
+	 */
+	public void setTimeout(int timeout) {
+		this.timeout = timeout;
+	}
+
+	/** @see #setTimeout(int timeout)*/
+	public int getTimeout() {
+		return timeout;
+	}
+	
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1626226&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Fri Sep 19 14:16:29 2014
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import java.awt.Graphics2D;
+import java.awt.Image;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import javax.imageio.ImageIO;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * TesseractOCRParser powered by tesseract-ocr engine.
+ * To enable this parser, create a {@link TesseractOCRConfig}
+ * object and pass it through a ParseContext.
+ * Tesseract-ocr must be installed and on system path or
+ * the path to its root folder must be provided:
+ * <p>
+ * TesseractOCRConfig config = new TesseractOCRConfig();<br>
+ * //Needed if tesseract is not on system path<br>
+ * config.setTesseractPath(tesseractFolder);<br>
+ * parseContext.set(TesseractOCRConfig.class, config);<br>
+ * </p>
+ * 
+ * 
+ */
+public class TesseractOCRParser extends AbstractParser {
+	
+	private static final long serialVersionUID = 1L;
+	
+	private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+	
+	private static Set<MediaType> getTypes() {
+		HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+		
+		supportedTypes.add(MediaType.image("png"));
+		supportedTypes.add(MediaType.image("jpeg"));
+		supportedTypes.add(MediaType.image("tiff"));
+		supportedTypes.add(MediaType.image("x-ms-bmp"));
+		supportedTypes.add(MediaType.image("gif"));
+		
+		return supportedTypes;
+	}
+	
+	@Override
+	public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+		return SUPPORTED_TYPES;
+	}
+
+    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+        if(!config.getTesseractPath().isEmpty()){
+            Map<String, String> env = pb.environment();
+            env.put("TESSDATA_PREFIX", config.getTesseractPath());
+        }
+    }
+	
+	public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+		
+		TemporaryResources tmp = new TemporaryResources();
+		FileOutputStream fos = null;
+		TikaInputStream tis = null;
+		try{
+			int w = image.getWidth(null);
+	        int h = image.getHeight(null);
+	        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+	        Graphics2D g2 = bImage.createGraphics();
+	        g2.drawImage(image, 0, 0, null);
+	        g2.dispose();
+	        File file = tmp.createTemporaryFile();
+			fos = new FileOutputStream(file);
+			ImageIO.write(bImage, "png", fos);
+			bImage = null;
+			tis = TikaInputStream.get(file);
+			parse(tis, handler, metadata, context);
+			
+		}finally{
+			tmp.dispose();
+			if(tis != null)
+				tis.close();
+			if(fos != null)
+				fos.close();
+		}
+		
+		
+	}
+
+	@Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+    	TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+    	if(config == null) config = new TesseractOCRConfig();
+    	
+    	XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+    	xhtml.startDocument();
+    	
+        TemporaryResources tmp = new TemporaryResources();
+        File output = null;
+        try {
+        	TikaInputStream  tikaStream = TikaInputStream.get(stream, tmp);
+        	File input = tikaStream.getFile();
+        	long size = tikaStream.getLength();
+        	
+        	if(size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()){
+        		
+            	output = tmp.createTemporaryFile();
+            	doOCR(input, output, config);
+            	
+                //Tesseract appends .txt to output file name
+                output = new File(output.getAbsolutePath() + ".txt");
+                
+                if(output.exists())
+                	extractOutput(new FileInputStream(output), xhtml);
+
+        	}
+        
+        } finally {
+        	tmp.dispose();
+        	if(output != null)
+        		output.delete();
+            
+        }
+        xhtml.endDocument();
+    }
+
+	/**
+	 * Run external tesseract-ocr process.
+	 * @param input File to be ocred
+     * @param output File to collect ocr result
+     * @param config Configuration of tesseract-ocr engine
+     * @throws TikaException if the extraction timed out
+     * @throws IOException if an input error occurred
+	 */
+    private void doOCR(File input, File output, TesseractOCRConfig config)
+            throws IOException, TikaException {
+        String[] cmd = {config.getTesseractPath() + "tesseract",
+    					input.getPath(), 
+						output.getPath() , 
+						"-l", 
+						config.getLanguage() , 
+						"-psm", 
+						config.getPageSegMode()	};
+            
+        ProcessBuilder pb = new ProcessBuilder(cmd);
+        setEnv(config, pb);
+        final Process process = pb.start();
+            
+        process.getOutputStream().close();
+        InputStream out = process.getInputStream();
+        InputStream err = process.getErrorStream();
+            
+        logStream("OCR MSG", out, input);
+        logStream("OCR ERROR", err, input);
+           
+        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+        	public Integer call() throws Exception {
+          	    return process.waitFor();
+          	}
+        });
+
+        Thread waitThread = new Thread(waitTask);
+        waitThread.start();
+          
+        try {
+        	waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+              
+        } catch (InterruptedException e) {
+        	waitThread.interrupt();
+          	process.destroy();
+          	Thread.currentThread().interrupt();
+          	throw new TikaException("TesseractOCRParser interrupted", e);
+          	
+        } catch (ExecutionException e) {
+			//should not be thrown
+				
+		} catch (TimeoutException e) {
+			waitThread.interrupt();
+			process.destroy();
+			throw new TikaException("TesseractOCRParser timeout", e);
+		}
+            	
+            
+    }
+    
+
+    /**
+     * Reads the contents of the given stream and write it to the 
+     * given XHTML content handler.
+     * The stream is closed once fully processed.
+     *
+     * @param stream Stream where is the result of ocr
+     * @param xhtml XHTML content handler
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws IOException if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
+            throws SAXException, IOException {
+    	
+        Reader reader = new InputStreamReader(stream, "UTF-8");
+        try {
+            xhtml.startElement("div");
+            char[] buffer = new char[1024];
+            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                xhtml.characters(buffer, 0, n);
+            }
+            xhtml.endElement("div");
+        } finally {
+            reader.close();
+        }
+    }
+
+    /**
+     * Starts a thread that reads the contents of the standard output
+     * or error stream of the given process to not block the process.
+     * The stream is closed once fully processed.
+     */
+    private void logStream(final String logType, final InputStream stream, final File file) {
+        new Thread() {
+            public void run() {
+            	Reader reader = new InputStreamReader(stream);
+                StringBuilder out = new StringBuilder();
+                char[] buffer = new char[1024];
+                try {
+					for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) 
+						out.append(buffer, 0, n);
+				} catch (IOException e) {
+					
+				} finally {
+                    IOUtils.closeQuietly(stream);
+                }
+			
+				
+				String msg = out.toString();
+				//log or discard message?
+				
+            }
+        }.start();
+    }
+
+	
+}
+
+

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java?rev=1626226&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java Fri Sep 19 14:16:29 2014
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assume.assumeTrue;
+
+public class TesseractOCRTest extends TikaTest {
+
+    /** Text the tests expect OCR to recover from each test document. */
+    private static final String IMAGE_TEXT = "Happy New Year 2003!";
+
+    /**
+     * Checks, via ExternalParser.check, the tesseract command built from
+     * the given configuration. If Tesseract is not on the path, do not
+     * run the test.
+     */
+    private boolean canRun(TesseractOCRConfig config) {
+        return ExternalParser.check(new String[] {config.getTesseractPath() + "tesseract"});
+    }
+
+    /**
+     * Creates a parse context holding the given configuration and a
+     * {@link TesseractOCRParser} registered as the context parser.
+     */
+    private ParseContext ocrContext(TesseractOCRConfig config) {
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+        return parseContext;
+    }
+
+    /**
+     * Parses the named test resource with an auto-detecting parser and
+     * returns the text collected by the body content handler.
+     */
+    private String extractText(String resource, ParseContext parseContext) throws Exception {
+        Parser parser = new AutoDetectParser();
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        InputStream stream = TesseractOCRTest.class.getResourceAsStream(resource);
+        try {
+            parser.parse(stream, handler, metadata, parseContext);
+        } finally {
+            stream.close();
+        }
+        return handler.toString();
+    }
+
+    /**
+     * The expected image text must appear in the text extracted from testOCR.pdf.
+     */
+    @Test
+    public void testPDFOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config));
+
+        // Enable inline image extraction in the PDF parser.
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        ParseContext parseContext = ocrContext(config);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        String text = extractText("/test-documents/testOCR.pdf", parseContext);
+
+        assertTrue(text.contains(IMAGE_TEXT));
+    }
+
+    /**
+     * Both the image text and the regular text must be extracted from testOCR.docx.
+     */
+    @Test
+    public void testDOCXOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config));
+
+        String text = extractText("/test-documents/testOCR.docx", ocrContext(config));
+
+        assertTrue(text.contains(IMAGE_TEXT));
+        assertTrue(text.contains("This is some text."));
+        assertTrue(text.contains("Here is an embedded image:"));
+    }
+
+    /**
+     * Both the image text and the regular text must be extracted from testOCR.pptx.
+     */
+    @Test
+    public void testPPTXOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config));
+
+        String text = extractText("/test-documents/testOCR.pptx", ocrContext(config));
+
+        assertTrue("Check for the image's text.", text.contains(IMAGE_TEXT));
+        assertTrue("Check for the standard text.", text.contains("This is some text"));
+    }
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1626226&r1=1626225&r2=1626226&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Sep 19 14:16:29 2014
@@ -562,6 +562,8 @@ public class PDFParserTest extends TikaT
         Set<String> knownMetadataDiffs = new HashSet<String>();
         //PDFBox-1792/Tika-1203
         knownMetadataDiffs.add("testAnnotations.pdf");
+        // Added for TIKA-93.
+        knownMetadataDiffs.add("testOCR.pdf");
 
         //empty for now
         Set<String> knownContentDiffs = new HashSet<String>();

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.docx?rev=1626226&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pdf?rev=1626226&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pptx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pptx?rev=1626226&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.pptx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream