You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [5/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ tik...

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE Wed Jan  6 03:50:50 2016
@@ -0,0 +1,14 @@
+APACHE TIKA SUBCOMPONENTS
+
+Apache Tika includes a number of subcomponents with separate copyright notices
+and license terms. Your use of these subcomponents is subject to the terms and
+conditions of the following licenses.
+
+Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp)
+    are in the public domain.  These files were retrieved from:
+    https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp
+    These photos are also available here:
+    https://developers.google.com/speed/webp/gallery2#webp_links
+    Credits for the photo:
+    "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers"
+    Image Author: Jon Sullivan

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.fontbox.afm.AFMParser;
+import org.apache.fontbox.afm.FontMetric;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Adobe Font Metrics (AFM) files.
+ * <p>
+ * FontBox's {@link AFMParser} reads the file. The descriptive values of the
+ * font are reported as metadata, the "Creation Date" comment (if any) is
+ * reported as the creation date, and the remaining comments are emitted as
+ * XHTML paragraphs.
+ */
+public class AdobeFontMetricParser extends AbstractParser {
+    /** Serial version UID */
+    private static final long serialVersionUID = -4820306522217196835L;
+
+    private static final MediaType AFM_TYPE =
+            MediaType.application("x-font-adobe-metric");
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(AFM_TYPE);
+
+    // TIKA-1325 Replace these with properties, from a well known standard
+    static final String MET_AVG_CHAR_WIDTH = "AvgCharacterWidth";
+    static final String MET_DOC_VERSION = "DocVersion";
+    static final String MET_PS_NAME = "PSName";
+    static final String MET_FONT_NAME = "FontName";
+    static final String MET_FONT_FULL_NAME = "FontFullName";
+    static final String MET_FONT_FAMILY_NAME = "FontFamilyName";
+    static final String MET_FONT_SUB_FAMILY_NAME = "FontSubFamilyName";
+    static final String MET_FONT_VERSION = "FontVersion";
+    static final String MET_FONT_WEIGHT = "FontWeight";
+    static final String MET_FONT_NOTICE = "FontNotice";
+    static final String MET_FONT_UNDERLINE_THICKNESS = "FontUnderlineThickness";
+
+    /**
+     * Returns the single media type handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return a set holding only the AFM media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an AFM stream, filling in the metadata and writing the file's
+     * comments to the handler as XHTML.
+     *
+     * @param stream   the AFM data
+     * @param handler  receives the XHTML output
+     * @param metadata receives the content type, title, creation date and font values
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the stream can not be read
+     * @throws SAXException  if the handler rejects the output
+     * @throws TikaException declared by the parser contract
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Have FontBox process the file
+        AFMParser parser = new AFMParser(stream);
+        parser.parse();
+        FontMetric fontMetrics = parser.getResult();
+
+        // Work on a private copy of the comments: the creation date entry is
+        // removed below, and the list handed out by FontBox belongs to the
+        // FontMetric (and may not be modifiable at all).
+        List<String> comments = new java.util.ArrayList<String>(fontMetrics.getComments());
+
+        // Get the creation date
+        extractCreationDate(metadata, comments);
+
+        metadata.set(Metadata.CONTENT_TYPE, AFM_TYPE.toString());
+        metadata.set(TikaCoreProperties.TITLE, fontMetrics.getFullName());
+
+        // Add metadata associated with the font type
+        addMetadataByString(metadata, MET_AVG_CHAR_WIDTH, Float.toString(fontMetrics.getAverageCharacterWidth()));
+        addMetadataByString(metadata, MET_DOC_VERSION, Float.toString(fontMetrics.getAFMVersion()));
+        addMetadataByString(metadata, MET_FONT_NAME, fontMetrics.getFontName());
+        addMetadataByString(metadata, MET_FONT_FULL_NAME, fontMetrics.getFullName());
+        addMetadataByString(metadata, MET_FONT_FAMILY_NAME, fontMetrics.getFamilyName());
+        addMetadataByString(metadata, MET_FONT_VERSION, fontMetrics.getFontVersion());
+        addMetadataByString(metadata, MET_FONT_WEIGHT, fontMetrics.getWeight());
+        addMetadataByString(metadata, MET_FONT_NOTICE, fontMetrics.getNotice());
+        addMetadataByString(metadata, MET_FONT_UNDERLINE_THICKNESS, Float.toString(fontMetrics.getUnderlineThickness()));
+
+        // Output the remaining comments as text
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        if (!comments.isEmpty()) {
+            xhtml.element("h1", "Comments");
+            xhtml.startElement("div", "class", "comments");
+            for (String comment : comments) {
+                xhtml.element("p", comment);
+            }
+            xhtml.endElement("div");
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Adds a value under the given metadata name, ignoring {@code null} values.
+     */
+    private void addMetadataByString(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /**
+     * Sets a value for the given metadata property, ignoring {@code null} values.
+     */
+    private void addMetadataByProperty(Metadata metadata, Property property, String value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /**
+     * Looks for the first "Creation Date: ..." comment, stores the text after
+     * the colon as the creation date and removes that comment from the list so
+     * it is not output a second time as text.
+     * <p>
+     * A matching comment that carries no colon, or nothing after the colon,
+     * is left in the list untouched and the search continues.
+     *
+     * @param metadata receives the creation date, if one is found
+     * @param comments modifiable list of comments; the date entry is removed from it
+     */
+    private void extractCreationDate(Metadata metadata, List<String> comments) {
+        String date = null;
+
+        // Index loop so the matching entry can be removed without disturbing an iterator
+        for (int i = 0; i < comments.size(); i++) {
+            String value = comments.get(i);
+            if (!value.matches(".*Creation\\sDate.*")) {
+                continue;
+            }
+            int colon = value.indexOf(':');
+            if (colon < 0) {
+                continue;
+            }
+            String candidate = value.substring(colon + 1).trim();
+            if (candidate.isEmpty()) {
+                continue;
+            }
+            date = candidate;
+            comments.remove(i);
+            break;
+        }
+
+        // If appropriate date then store as metadata
+        if (date != null) {
+            addMetadataByProperty(metadata, Metadata.CREATION_DATE, date);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.NameRecord;
+import org.apache.fontbox.ttf.NamingTable;
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for TrueType font files (TTF).
+ * <p>
+ * FontBox's {@link TTFParser} reads the font; only metadata is reported,
+ * no textual content.
+ */
+public class TrueTypeParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 44788554612243032L;
+
+    private static final MediaType TYPE =
+        MediaType.application("x-font-ttf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    /**
+     * Returns the single media type handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return a set holding only the TTF media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a TrueType font and reports its header dates, version and
+     * naming records as metadata, then emits an empty XHTML document.
+     *
+     * @param stream   the font data; if it is a file-backed TikaInputStream
+     *                 the file is handed to FontBox directly
+     * @param handler  receives the (empty) XHTML document
+     * @param metadata receives the font details
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the font can not be read or closed
+     * @throws SAXException  if the handler rejects the output
+     * @throws TikaException declared by the parser contract
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TikaInputStream tis = TikaInputStream.cast(stream);
+
+        // Ask FontBox to parse the file for us
+        TrueTypeFont font;
+        TTFParser parser = new TTFParser();
+        if (tis != null && tis.hasFile()) {
+            font = parser.parseTTF(tis.getFile());
+        } else {
+            font = parser.parseTTF(stream);
+        }
+
+        try {
+            // Report the details of the font
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            metadata.set(TikaCoreProperties.CREATED,
+                    font.getHeader().getCreated());
+            metadata.set(TikaCoreProperties.MODIFIED,
+                    font.getHeader().getModified());
+            metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
+                    Float.toString(font.getHeader().getVersion()));
+
+            // Pull out the naming info; a font without a naming table
+            // simply contributes no name metadata
+            NamingTable fontNaming = font.getNaming();
+            if (fontNaming != null) {
+                for (NameRecord nr : fontNaming.getNameRecords()) {
+                    if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
+                        metadata.set(TikaCoreProperties.TITLE, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
+                        metadata.set("Copyright", nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
+                        metadata.set("Trademark", nr.getString());
+                    }
+                }
+            }
+        } finally {
+            // Release the data FontBox holds open for the font (a file
+            // handle when the font was parsed from a file)
+            font.close();
+        }
+
+        // For now, we only output metadata, no textual contents
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Configuration for TesseractOCRParser.
+ *
+ * This allows to enable TesseractOCRParser and set its parameters:
+ * <p>
+ * TesseractOCRConfig config = new TesseractOCRConfig();<br>
+ * config.setTesseractPath(tesseractFolder);<br>
+ * parseContext.set(TesseractOCRConfig.class, config);<br>
+ * </p>
+ *
+ * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in,
+ * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own
+ * and placing it in the package org/apache/tika/parser/ocr on the classpath.
+ *
+ */
+public class TesseractOCRConfig implements Serializable {
+
+    private static final long serialVersionUID = -4861942486845757891L;
+
+    // Path to tesseract installation folder, if not on system path.
+    private String tesseractPath = "";
+
+    // Path to the 'tessdata' folder, which contains language files and config files.
+    private String tessdataPath = "";
+
+    // Language dictionary to be used.
+    private String language = "eng";
+
+    // Tesseract page segmentation mode.
+    private String pageSegMode = "1";
+
+    // Minimum file size to submit file to ocr.
+    private int minFileSizeToOcr = 0;
+
+    // Maximum file size to submit file to ocr.
+    private int maxFileSizeToOcr = Integer.MAX_VALUE;
+
+    // Maximum time (seconds) to wait for the ocring process termination
+    private int timeout = 120;
+
+    /**
+     * Default constructor. Reads TesseractOCRConfig.properties from this
+     * class's package on the classpath if it is present; otherwise the
+     * built-in defaults are kept.
+     */
+    public TesseractOCRConfig() {
+        init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
+    }
+
+    /**
+     * Loads properties from InputStream and then tries to close InputStream.
+     * If there is an IOException, this silently swallows the exception
+     * and goes back to the default.
+     *
+     * @param is stream in {@link Properties} format; {@code null} keeps all defaults
+     * @throws IllegalArgumentException if the language or page segmentation mode is invalid
+     * @throws RuntimeException if a numeric property is not a valid integer
+     */
+    public TesseractOCRConfig(InputStream is) {
+        init(is);
+    }
+
+    /**
+     * Applies the properties found in the stream on top of the current values.
+     * Properties that are absent leave the corresponding value unchanged.
+     */
+    private void init(InputStream is) {
+        if (is == null) {
+            return;
+        }
+        Properties props = new Properties();
+        // The stream is closed on every path; failures while reading or
+        // closing are deliberately ignored so the defaults stay in force.
+        try (InputStream in = is) {
+            props.load(in);
+        } catch (IOException ignored) {
+            // documented best effort: fall back to whatever was loaded so far
+        }
+
+        setTesseractPath(
+                getProp(props, "tesseractPath", getTesseractPath()));
+        setTessdataPath(
+                getProp(props, "tessdataPath", getTessdataPath()));
+        setLanguage(
+                getProp(props, "language", getLanguage()));
+        setPageSegMode(
+                getProp(props, "pageSegMode", getPageSegMode()));
+        setMinFileSizeToOcr(
+                getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
+        setMaxFileSizeToOcr(
+                getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
+        setTimeout(
+                getProp(props, "timeout", getTimeout()));
+    }
+
+    /** @see #setTesseractPath(String tesseractPath)*/
+    public String getTesseractPath() {
+        return tesseractPath;
+    }
+
+    /**
+     * Set the path to the Tesseract executable, needed if it is not on system path.
+     * A trailing file separator is appended when the path is non-empty and lacks one.
+     * <p>
+     * Note that if you set this value, it is highly recommended that you also
+     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+     * </p>
+     */
+    public void setTesseractPath(String tesseractPath) {
+        if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) {
+            tesseractPath += File.separator;
+        }
+
+        this.tesseractPath = tesseractPath;
+    }
+
+    /** @see #setTessdataPath(String tessdataPath) */
+    public String getTessdataPath() {
+        return tessdataPath;
+    }
+
+    /**
+     * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
+     * as on Windows), this folder is found in the Tesseract installation, but in other cases
+     * (such as when Tesseract is built from source), it may be located elsewhere.
+     * A trailing file separator is appended when the path is non-empty and lacks one.
+     */
+    public void setTessdataPath(String tessdataPath) {
+        if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator)) {
+            tessdataPath += File.separator;
+        }
+
+        this.tessdataPath = tessdataPath;
+    }
+
+    /** @see #setLanguage(String language)*/
+    public String getLanguage() {
+        return language;
+    }
+
+    /**
+     * Set tesseract language dictionary to be used. Default is "eng".
+     * Multiple languages may be specified, separated by plus characters.
+     * Only letters, underscores (as in "chi_sim") and plus separators are accepted.
+     *
+     * @throws IllegalArgumentException if the value contains any other character
+     */
+    public void setLanguage(String language) {
+        // One character per repetition keeps the match linear; underscore is
+        // allowed because Tesseract ships language packs such as chi_sim.
+        if (!language.matches("([A-Za-z_](\\+?))*")) {
+            throw new IllegalArgumentException("Invalid language code");
+        }
+        this.language = language;
+    }
+
+    /** @see #setPageSegMode(String pageSegMode)*/
+    public String getPageSegMode() {
+        return pageSegMode;
+    }
+
+    /**
+     * Set tesseract page segmentation mode.
+     * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+     *
+     * @throws IllegalArgumentException if the value is not a number from 1 to 10
+     */
+    public void setPageSegMode(String pageSegMode) {
+        if (!pageSegMode.matches("[1-9]|10")) {
+            throw new IllegalArgumentException("Invalid page segmentation mode");
+        }
+        this.pageSegMode = pageSegMode;
+    }
+
+    /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
+    public int getMinFileSizeToOcr() {
+        return minFileSizeToOcr;
+    }
+
+    /**
+     * Set minimum file size to submit file to ocr.
+     * Default is 0.
+     */
+    public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+        this.minFileSizeToOcr = minFileSizeToOcr;
+    }
+
+    /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
+    public int getMaxFileSizeToOcr() {
+        return maxFileSizeToOcr;
+    }
+
+    /**
+     * Set maximum file size to submit file to ocr.
+     * Default is Integer.MAX_VALUE.
+     */
+    public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+        this.maxFileSizeToOcr = maxFileSizeToOcr;
+    }
+
+    /**
+     * Set maximum time (seconds) to wait for the ocring process to terminate.
+     * Default value is 120s.
+     */
+    public void setTimeout(int timeout) {
+        this.timeout = timeout;
+    }
+
+    /** @see #setTimeout(int timeout)*/
+    public int getTimeout() {
+        return timeout;
+    }
+
+    /**
+     * Get an integer property from the properties file passed in.
+     * Surrounding whitespace is ignored; a missing or blank value yields the default.
+     *
+     * @param properties properties file to read from.
+     * @param property the property to fetch.
+     * @param defaultMissing default parameter to use.
+     * @return the value.
+     * @throws RuntimeException if the value is present but not a valid integer
+     */
+    private int getProp(Properties properties, String property, int defaultMissing) {
+        String p = properties.getProperty(property);
+        if (p == null) {
+            return defaultMissing;
+        }
+        // Properties.load keeps trailing whitespace in values
+        p = p.trim();
+        if (p.isEmpty()) {
+            return defaultMissing;
+        }
+        try {
+            return Integer.parseInt(p);
+        } catch (NumberFormatException ex) {
+            throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
+                    property), ex);
+        }
+    }
+
+    /**
+     * Get a string property from the properties file passed in.
+     *
+     * @param properties properties file to read from.
+     * @param property the property to fetch.
+     * @param defaultMissing default parameter to use.
+     * @return the value.
+     */
+    private String getProp(Properties properties, String property, String defaultMissing) {
+        return properties.getProperty(property, defaultMissing);
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,329 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import javax.imageio.ImageIO;
+
+import java.awt.Image;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.image.TiffParser;
+import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
+ * create a {@link TesseractOCRConfig} object and pass it through a
+ * ParseContext. Tesseract-ocr must be installed and on system path or the path
+ * to its root folder must be provided:
+ * <p>
+ * TesseractOCRConfig config = new TesseractOCRConfig();<br>
+ * //Needed if tesseract is not on system path<br>
+ * config.setTesseractPath(tesseractFolder);<br>
+ * parseContext.set(TesseractOCRConfig.class, config);<br>
+ * </p>
+ *
+ *
+ */
+public class TesseractOCRParser extends AbstractParser {
+    private static final long serialVersionUID = -8167538283213097265L;
+    // Configuration used whenever the ParseContext does not carry a TesseractOCRConfig
+    private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
+    // Image types offered by this parser when a working Tesseract has been found
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<MediaType>(Arrays.asList(new MediaType[] {
+                    MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
+                    MediaType.image("x-ms-bmp"), MediaType.image("gif")
+            })));
+    // Cache of availability checks, keyed by the full program path that was probed.
+    // NOTE(review): a plain HashMap shared through a static field with no locking
+    // visible here - confirm whether parsers can reach it from several threads.
+    private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
+
+    /**
+     * Reports the image types this parser can OCR, but only when Tesseract
+     * can actually be run with the configuration found in the context.
+     *
+     * @param context consulted for a {@link TesseractOCRConfig}; the default
+     *                configuration is used when none is present
+     * @return the supported image types, or an empty set without Tesseract
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+        // No Tesseract: advertise nothing, so the other image parsers
+        //  can be selected instead
+        if (!hasTesseract(ocrConfig)) {
+            return Collections.emptySet();
+        }
+
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Points TESSDATA_PREFIX in the process environment at the configured
+     * tessdata folder, falling back to the Tesseract folder; when both are
+     * empty the environment is left as it is.
+     */
+    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+        Map<String, String> environment = pb.environment();
+
+        // Prefer the explicit tessdata location over the installation folder
+        String prefixDir = config.getTessdataPath();
+        if (prefixDir.isEmpty()) {
+            prefixDir = config.getTesseractPath();
+        }
+
+        if (!prefixDir.isEmpty()) {
+            environment.put("TESSDATA_PREFIX", prefixDir);
+        }
+    }
+
+    /**
+     * Tells whether Tesseract can be run from the location named by the
+     * configuration. The outcome of each probed location is remembered, so
+     * the external check runs once per distinct program path.
+     */
+    private boolean hasTesseract(TesseractOCRConfig config) {
+        // Full path of the program the config says to use
+        String program = config.getTesseractPath() + getTesseractProg();
+
+        if (!TESSERACT_PRESENT.containsKey(program)) {
+            // First time for this location: try running it and remember the outcome
+            boolean works = ExternalParser.check(new String[] { program });
+            TESSERACT_PRESENT.put(program, works);
+            return works;
+        }
+
+        return TESSERACT_PRESENT.get(program);
+    }
+
+    /**
+     * Runs OCR on an in-memory image by rendering it to a temporary PNG file
+     * and handing that file to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
+     *
+     * @param image    the image to OCR; its dimensions must already be known
+     * @param handler  receives the extracted text
+     * @param metadata receives the metadata
+     * @param context  the parse context
+     * @throws IOException   if the temporary file can not be written or read
+     * @throws SAXException  if the handler rejects the output
+     * @throws TikaException if the parse fails or the temporary resources can not be released
+     */
+    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            int w = image.getWidth(null);
+            int h = image.getHeight(null);
+            BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+
+            // Copy the pixels of the source image; a freshly created
+            // BufferedImage is blank and would give Tesseract nothing to read
+            java.awt.Graphics2D graphics = bImage.createGraphics();
+            try {
+                graphics.drawImage(image, 0, 0, null);
+            } finally {
+                graphics.dispose();
+            }
+
+            File file = tmp.createTemporaryFile();
+            // Close the writer before the file is read back and before it is deleted
+            try (FileOutputStream fos = new FileOutputStream(file)) {
+                ImageIO.write(bImage, "png", fos);
+            }
+
+            try (TikaInputStream tis = TikaInputStream.get(file)) {
+                parse(tis, handler, metadata, context);
+            }
+        } finally {
+            // Last, once every stream on the temporary file has been closed
+            tmp.dispose();
+        }
+
+    }
+
+    /**
+     * Runs Tesseract on the image in the stream and writes the recognised
+     * text to the handler, then passes the image on to the regular image
+     * parsers so their metadata is extracted as well.
+     * <p>
+     * Nothing at all is done when Tesseract is not available with the
+     * current configuration. OCR is skipped (but metadata is still
+     * extracted) when the file size is outside the configured limits.
+     *
+     * @param stream   the image data
+     * @param handler  receives the extracted text
+     * @param metadata receives the metadata
+     * @param context  consulted for a {@link TesseractOCRConfig}
+     * @throws IOException   if the image or the OCR output can not be read
+     * @throws SAXException  if the handler rejects the output
+     * @throws TikaException if OCR times out or is interrupted
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+        // If Tesseract is not on the path with the current config, do not try to run OCR
+        // getSupportedTypes shouldn't have listed us as handling it, so this should only
+        //  occur if someone directly calls this parser, not via DefaultParser or similar
+        if (!hasTesseract(config)) {
+            return;
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        TemporaryResources tmp = new TemporaryResources();
+        File ocrText = null;
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+            File input = tikaStream.getFile();
+            long size = tikaStream.getLength();
+
+            if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+
+                File outputBase = tmp.createTemporaryFile();
+
+                // Tesseract appends .txt to output file name. Remember that
+                // file before running, so it is removed even if OCR fails
+                // or times out after the file was created.
+                ocrText = new File(outputBase.getAbsolutePath() + ".txt");
+                doOCR(input, outputBase, config);
+
+                if (ocrText.exists()) {
+                    extractOutput(new FileInputStream(ocrText), xhtml);
+                }
+
+            }
+
+            // Temporary workaround for TIKA-1445 - until we can specify
+            //  composite parsers with strategies (eg Composite, Try In Turn),
+            //  always send the image onwards to the regular parser to have
+            //  the metadata for them extracted as well
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
+        } finally {
+            try {
+                tmp.dispose();
+            } finally {
+                // Not tracked by tmp, so it has to be removed by hand
+                if (ocrText != null) {
+                    ocrText.delete();
+                }
+            }
+        }
+    }
+    // TIKA-1445 workaround: the parser the image is passed on to for metadata extraction
+    private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
+
+    /**
+     * Composite of the plain image parsers (generic image, JPEG and TIFF),
+     * built on a fresh media type registry.
+     */
+    private static class CompositeImageParser extends CompositeParser {
+        private static final long serialVersionUID = -2398203346206381382L;
+        private static List<Parser> imageParsers =
+                Arrays.<Parser>asList(new ImageParser(), new JpegParser(), new TiffParser());
+
+        CompositeImageParser() {
+            super(new MediaTypeRegistry(), imageParsers);
+        }
+    }
+
+    /**
+     * Run external tesseract-ocr process.
+     * <p>
+     * The command line is built from the configured program path, language
+     * and page segmentation mode; the call then waits for the process to
+     * exit for at most the configured timeout (in seconds). On timeout or
+     * interruption the process is destroyed.
+     *
+     * @param input
+     *          File to be ocred
+     * @param output
+     *          File to collect ocr result
+     * @param config
+     *          Configuration of tesseract-ocr engine
+     * @throws TikaException
+     *           if the extraction timed out or the waiting thread was interrupted
+     * @throws IOException
+     *           if an input error occurred
+     */
+    private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+        // Arguments are passed as separate array elements, not as one shell string
+        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+                config.getLanguage(), "-psm", config.getPageSegMode() };
+
+        ProcessBuilder pb = new ProcessBuilder(cmd);
+        setEnv(config, pb);
+        final Process process = pb.start();
+
+        // Nothing is written to the process, so its stdin is closed right away
+        process.getOutputStream().close();
+        InputStream out = process.getInputStream();
+        InputStream err = process.getErrorStream();
+
+        // NOTE(review): logStream is defined outside this view; presumably it
+        // consumes each stream without blocking this thread - confirm
+        logStream("OCR MSG", out, input);
+        logStream("OCR ERROR", err, input);
+
+        // Wait for the process on a separate thread so the wait can be bounded by the timeout
+        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+            public Integer call() throws Exception {
+                return process.waitFor();
+            }
+        });
+
+        Thread waitThread = new Thread(waitTask);
+        waitThread.start();
+
+        try {
+            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+        } catch (InterruptedException e) {
+            waitThread.interrupt();
+            process.destroy();
+            // Restore the interrupt flag for the caller before reporting the failure
+            Thread.currentThread().interrupt();
+            throw new TikaException("TesseractOCRParser interrupted", e);
+
+        } catch (ExecutionException e) {
+            // should not be thrown
+
+        } catch (TimeoutException e) {
+            waitThread.interrupt();
+            process.destroy();
+            throw new TikaException("TesseractOCRParser timeout", e);
+        }
+
+    }
+
+    /**
+     * Copies the text read from the given stream, decoded as UTF-8, into the
+     * XHTML content handler as a single {@code div} element wrapped in a
+     * start/end document pair. The stream is closed once it has been read.
+     *
+     * @param stream
+     *          Stream holding the OCR result
+     * @param xhtml
+     *          XHTML content handler receiving the text
+     * @throws SAXException
+     *           if the XHTML SAX events could not be handled
+     * @throws IOException
+     *           if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+        xhtml.startDocument();
+        xhtml.startElement("div");
+        try (Reader ocrText = new InputStreamReader(stream, UTF_8)) {
+            char[] chunk = new char[1024];
+            int count;
+            while ((count = ocrText.read(chunk)) != -1) {
+                if (count == 0) {
+                    // Nothing was read this round; do not emit an empty event.
+                    continue;
+                }
+                xhtml.characters(chunk, 0, count);
+            }
+        }
+        xhtml.endElement("div");
+        xhtml.endDocument();
+    }
+
+    /**
+     * Starts a thread that reads the contents of the standard output or error
+     * stream of the given process to not block the process, and writes what
+     * was read to the debug log. The stream is closed once fully processed.
+     *
+     * @param logType
+     *          Label identifying the stream; prefixed to the logged message
+     * @param stream
+     *          Process stream to drain
+     * @param file
+     *          File being processed; included in the logged message
+     */
+    private void logStream(final String logType, final InputStream stream, final File file) {
+        new Thread() {
+            @Override
+            public void run() {
+                Reader reader = new InputStreamReader(stream, UTF_8);
+                StringBuilder out = new StringBuilder();
+                char[] buffer = new char[1024];
+                try {
+                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                        out.append(buffer, 0, n);
+                    }
+                } catch (IOException e) {
+                    // Best effort: keep whatever was read so far, but leave a trace of the failure.
+                    LogFactory.getLog(TesseractOCRParser.class).debug(
+                            logType + ": failed to read process output for " + file, e);
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
+
+                // Label the output so stdout and stderr can be told apart in the log.
+                String msg = logType + " (" + file + "): " + out;
+                LogFactory.getLog(TesseractOCRParser.class).debug(msg);
+            }
+        }.start();
+    }
+
+    /**
+     * Returns the file name of the tesseract executable for the current
+     * platform, decided by the {@code os.name} system property.
+     *
+     * @return {@code "tesseract.exe"} when {@code os.name} starts with
+     *         {@code "Windows"}, otherwise {@code "tesseract"}
+     */
+    static String getTesseractProg() {
+        String osName = System.getProperty("os.name");
+        if (osName.startsWith("Windows")) {
+            return "tesseract.exe";
+        }
+        return "tesseract";
+    }
+
+}

Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&r1=1723222&r2=1723223&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan  6 03:50:50 2016
@@ -14,6 +14,8 @@
 #  limitations under the License.
 
 
+org.apache.tika.parser.font.AdobeFontMetricParser
+org.apache.tika.parser.font.TrueTypeParser
 org.apache.tika.parser.image.BPGParser
 org.apache.tika.parser.image.ImageParser
 org.apache.tika.parser.image.PSDParser
@@ -22,6 +24,7 @@ org.apache.tika.parser.image.WebPParser
 org.apache.tika.parser.jpeg.JpegParser
 org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
+org.apache.tika.parser.ocr.TesseractOCRParser
 org.apache.tika.parser.mp3.Mp3Parser
 org.apache.tika.parser.mp4.MP4Parser
 org.apache.tika.parser.video.FLVParser

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties Wed Jan  6 03:50:50 2016
@@ -0,0 +1,21 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+tesseractPath=
+language=eng
+pageSegMode=1
+maxFileSizeToOcr=2147483647
+minFileSizeToOcr=0
+timeout=120
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FAMILY_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FULL_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_VERSION;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_WEIGHT;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_PS_NAME;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Exercises font file parsing through the auto-detecting parser.
+ */
+public class FontParsersTest {
+
+    /**
+     * Parses the named classpath resource with a fresh {@link AutoDetectParser},
+     * feeding the supplied handler and metadata.
+     */
+    private static void parseResource(String resource, ContentHandler handler, Metadata metadata)
+            throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ParseContext context = new ParseContext();
+        try (TikaInputStream stream = TikaInputStream.get(
+                FontParsersTest.class.getResource(resource))) {
+            parser.parse(stream, handler, metadata, context);
+        }
+    }
+
+    /**
+     * An AFM file yields the expected type, title, date and font metadata,
+     * and its comments end up in the extracted text.
+     */
+    @Test
+    public void testAdobeFontMetricParsing() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        parseResource("/test-documents/testAFM.afm", handler, metadata);
+
+        assertEquals("application/x-font-adobe-metric", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("TestFullName", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Fri Jul 15 17:50:51 2011", metadata.get(Metadata.CREATION_DATE));
+
+        assertEquals("TestFontName", metadata.get(MET_FONT_NAME));
+        assertEquals("TestFullName", metadata.get(MET_FONT_FULL_NAME));
+        assertEquals("TestSymbol", metadata.get(MET_FONT_FAMILY_NAME));
+
+        assertEquals("Medium", metadata.get(MET_FONT_WEIGHT));
+        assertEquals("001.008", metadata.get(MET_FONT_VERSION));
+
+        // Test that the comments got extracted
+        String content = handler.toString();
+        assertContains("Comments", content);
+        assertContains("This is a comment in a sample file", content);
+        assertContains("UniqueID 12345", content);
+    }
+
+    /**
+     * A TrueType file yields the expected type, title, dates and naming
+     * metadata, and no text content.
+     */
+    @Test
+    public void testTTFParsing() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        //Open Sans font is ASL 2.0 according to
+        //http://www.google.com/fonts/specimen/Open+Sans
+        //...despite the copyright in the file's metadata.
+        parseResource("/test-documents/testTrueType3.ttf", handler, metadata);
+
+        assertEquals("application/x-font-ttf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Open Sans Bold", metadata.get(TikaCoreProperties.TITLE));
+
+        assertEquals("2010-12-30T11:04:00Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2010-12-30T11:04:00Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-05-05T12:37:53Z", metadata.get(TikaCoreProperties.MODIFIED));
+
+        assertEquals("Open Sans Bold", metadata.get(MET_FONT_NAME));
+        assertEquals("Open Sans", metadata.get(MET_FONT_FAMILY_NAME));
+        assertEquals("Bold", metadata.get(MET_FONT_SUB_FAMILY_NAME));
+        assertEquals("OpenSans-Bold", metadata.get(MET_PS_NAME));
+
+        assertEquals("Digitized", metadata.get("Copyright").substring(0, 9));
+        assertEquals("Open Sans", metadata.get("Trademark").substring(0, 9));
+
+        // Not extracted
+        assertEquals(null, metadata.get(MET_FONT_FULL_NAME));
+        assertEquals(null, metadata.get(MET_FONT_WEIGHT));
+        assertEquals(null, metadata.get(MET_FONT_VERSION));
+
+        // Currently, the parser doesn't extract any contents
+        assertEquals("", handler.toString());
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests the default, partially overridden and fully overridden values of
+ * {@link TesseractOCRConfig}, and the validation done by its setters.
+ */
+public class TesseractOCRConfigTest extends TikaTest {
+
+    /** A config built without a properties stream exposes the default values. */
+    @Test
+    public void testNoConfig() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+        assertEquals("Invalid default language value", "eng", config.getLanguage());
+        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
+        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid default timeout value", 120, config.getTimeout());
+    }
+
+    /** Values missing from a partial properties file keep their defaults. */
+    @Test
+    public void testPartialConfig() throws Exception {
+        TesseractOCRConfig config;
+        // Close the resource stream ourselves rather than relying on the constructor to do it.
+        try (InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+                "/test-properties/TesseractOCRConfig-partial.properties")) {
+            config = new TesseractOCRConfig(stream);
+        }
+
+        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+    }
+
+    /** Every value present in a full properties file overrides its default. */
+    @Test
+    public void testFullConfig() throws Exception {
+        TesseractOCRConfig config;
+        try (InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+                "/test-properties/TesseractOCRConfig-full.properties")) {
+            config = new TesseractOCRConfig(stream);
+        }
+
+        assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
+        assertEquals("Invalid overridden tessdataPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
+        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+        assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
+        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+        assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+    }
+
+    /**
+     * Valid language strings are accepted and an invalid one is rejected with
+     * an {@link IllegalArgumentException}.
+     */
+    @Test
+    public void testValidateLanguage() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+
+        // Valid values sit outside the try block: an exception here fails the test.
+        config.setLanguage("eng");
+        config.setLanguage("eng+fra");
+
+        try {
+            config.setLanguage("rm -Rf *");
+            fail("Invalid language value was accepted");
+        } catch (IllegalArgumentException expected) {
+            // expected: the invalid value was rejected
+        }
+    }
+
+    /**
+     * Valid page segmentation modes are accepted and an invalid one is
+     * rejected with an {@link IllegalArgumentException}.
+     */
+    @Test
+    public void testValidatePageSegMode() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+
+        // Valid values sit outside the try block: an exception here fails the test.
+        config.setPageSegMode("0");
+        config.setPageSegMode("10");
+
+        try {
+            config.setPageSegMode("11");
+            fail("Invalid pageSegMode value was accepted");
+        } catch (IllegalArgumentException expected) {
+            // expected: the invalid value was rejected
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.mail.RFC822Parser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Tests for {@link TesseractOCRParser}. Tests that need the tesseract
+ * executable are skipped or relaxed when it cannot be run.
+ */
+public class TesseractOCRParserTest extends TikaTest {
+
+    /**
+     * Reports whether the tesseract executable can be run using a default
+     * {@link TesseractOCRConfig}.
+     */
+    public static boolean canRun() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
+        return tesseractOCRTest.canRun(config);
+    }
+
+    /**
+     * Reports whether the tesseract executable named by the given config can be run.
+     */
+    private boolean canRun(TesseractOCRConfig config) {
+        String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
+        // If Tesseract is not on the path, do not run the test.
+        return ExternalParser.check(checkCmd);
+    }
+
+    /*
+    Check that if Tesseract is not found, the TesseractOCRParser claims to not support
+    any file types. So, the standard image parser is called instead.
+     */
+    @Test
+    public void offersNoTypesIfNotFound() throws Exception {
+        TesseractOCRParser parser = new TesseractOCRParser();
+        DefaultParser defaultParser = new DefaultParser();
+        MediaType png = MediaType.image("png");
+
+        // With an invalid path, will offer no types
+        TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
+        invalidConfig.setTesseractPath("/made/up/path");
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, invalidConfig);
+
+        // No types offered
+        assertEquals(0, parser.getSupportedTypes(parseContext).size());
+
+        // And DefaultParser won't use us
+        assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    /*
+    If Tesseract is found, test that the parser offers the expected number of
+    media types and that DefaultParser picks it for PNG.
+     */
+    @Test
+    public void offersTypesIfFound() throws Exception {
+        TesseractOCRParser parser = new TesseractOCRParser();
+        DefaultParser defaultParser = new DefaultParser();
+
+        ParseContext parseContext = new ParseContext();
+        MediaType png = MediaType.image("png");
+
+        // Only meaningful when Tesseract can be run; otherwise skip.
+        assumeTrue(canRun());
+
+        // The parser should offer 5 supported media types, PNG among them.
+        assertEquals(5, parser.getSupportedTypes(parseContext).size());
+        assertTrue(parser.getSupportedTypes(parseContext).contains(png));
+
+        // DefaultParser will now select the TesseractOCRParser.
+        assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    @Test
+    public void testPDFOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pdf";
+        String[] nonOCRContains = new String[0];
+        testBasicOCR(resource, nonOCRContains, 2);
+    }
+
+    @Test
+    public void testDOCXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.docx";
+        String[] nonOCRContains = {
+                "This is some text.",
+                "Here is an embedded image:"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    @Test
+    public void testPPTXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pptx";
+        String[] nonOCRContains = {
+                "This is some text"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    /**
+     * Parses the given resource recursively and checks the collected
+     * metadata and text.
+     *
+     * @param resource classpath location of the document to parse
+     * @param nonOCRContains strings the combined text must contain whether
+     *                       or not Tesseract can be run
+     * @param numMetadatas expected number of {@link Metadata} objects
+     *                     collected by the {@link RecursiveParserWrapper}
+     */
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
+                new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, parser);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
+            parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
+        }
+        List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
+        assertEquals(numMetadatas, metadataList.size());
+
+        StringBuilder contents = new StringBuilder();
+        for (Metadata m : metadataList) {
+            contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+        }
+        // The OCRed text can only be expected when Tesseract can actually be run.
+        if (canRun()) {
+            assertTrue(contents.toString().contains("Happy New Year 2003!"));
+        }
+        for (String needle : nonOCRContains) {
+            assertContains(needle, contents.toString());
+        }
+        assertTrue(metadataList.get(0).names().length > 10);
+        assertTrue(metadataList.get(1).names().length > 10);
+        //test at least one value
+        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+    }
+
+    @Test
+    public void testSingleImage() throws Exception {
+        assumeTrue(canRun());
+        String xml = getXML("testOCR.jpg").xml;
+        assertContains("OCR Testing", xml);
+    }
+
+    @Test
+    public void getNormalMetadataToo() throws Exception {
+        //this should be successful whether or not TesseractOCR is installed/active
+        //If tesseract is installed, the internal metadata extraction parser should
+        //work; and if tesseract isn't installed, the regular parsers should take over.
+
+        //gif
+        Metadata m = getXML("testGIF.gif").metadata;
+        assertTrue(m.names().length > 20);
+        assertEquals("RGB", m.get("Chroma ColorSpaceType"));
+
+        //jpg
+        m = getXML("testOCR.jpg").metadata;
+        assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
+        assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
+
+        //bmp
+        m = getXML("testBMP.bmp").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+
+        //png
+        m = getXML("testPNG.png").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
+
+        //tiff
+        m = getXML("testTIFF.tif").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("72 dots per inch", m.get("Y Resolution"));
+    }
+
+    /**
+     * Parses a multipart RFC822 message twice: once against a mocked handler
+     * to check the emitted SAX events, and once against a
+     * {@link BodyContentHandler} to check the extracted text. Any exception
+     * propagates so the test report keeps the full stack trace.
+     */
+    @Test
+    public void testMultipart() throws Exception {
+        ContentHandler mockHandler = mock(XHTMLContentHandler.class);
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            new RFC822Parser().parse(stream, mockHandler, new Metadata(), new ParseContext());
+        }
+        verify(mockHandler).startDocument();
+        int bodyExpectedTimes = 4;
+        // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
+        // But, different versions of Tesseract lead to a different number of invocations. So, we
+        // only verify the handler if Tesseract cannot run.
+        if (!TesseractOCRParserTest.canRun()) {
+            verify(mockHandler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+            verify(mockHandler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+        }
+
+        //repeat, this time looking at content
+        ContentHandler bodyHandler = new BodyContentHandler();
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            new RFC822Parser().parse(stream, bodyHandler, new Metadata(), new ParseContext());
+        }
+        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+        String bodyText = bodyHandler.toString();
+        assertTrue(bodyText.contains("body 1"));
+        assertTrue(bodyText.contains("body 2"));
+        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+    }
+
+    /**
+     * Opens the named resource through the context class loader, failing
+     * the test if it cannot be found.
+     */
+    private static InputStream getStream(String name) {
+        InputStream stream = Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream(name);
+        assertNotNull("Test file not found " + name, stream);
+        return stream;
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml Wed Jan  6 03:50:50 2016
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-office-module</artifactId>
+  <name>Apache Tika Office Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi</artifactId>
+      <version>${poi.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-scratchpad</artifactId>
+      <version>${poi.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-ooxml</artifactId>
+      <version>${poi.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>stax</groupId>
+          <artifactId>stax-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>xml-apis</groupId>
+          <artifactId>xml-apis</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>com.healthmarketscience.jackcess</groupId>
+      <artifactId>jackcess</artifactId>
+      <version>2.1.2</version>
+    </dependency>
+    <dependency>
+      <groupId>com.healthmarketscience.jackcess</groupId>
+      <artifactId>jackcess-encrypt</artifactId>
+      <version>2.1.1</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-package-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-web-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for CHM files. Every directory listing entry whose name ends in
+ * {@code .html} or {@code .htm} is extracted and run through
+ * {@link HtmlParser}; the results are forwarded into one XHTML document.
+ */
+public class ChmParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 5938777307516469802L;
+
+    /** Media types this parser announces as supported. */
+    private static final Set<MediaType> SUPPORTED_TYPES;
+
+    static {
+        Set<MediaType> types = new HashSet<MediaType>();
+        types.add(MediaType.application("vnd.ms-htmlhelp"));
+        types.add(MediaType.application("chm"));
+        types.add(MediaType.application("x-chm"));
+        SUPPORTED_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts the HTML pages of the CHM stream and emits them as a single
+     * XHTML document on {@code handler}. The content type in {@code metadata}
+     * is set to {@code application/vnd.ms-htmlhelp}.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        ChmExtractor extractor = new ChmExtractor(stream);
+
+        metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        for (DirectoryListingEntry entry : extractor.getChmDirList().getDirectoryListingEntryList()) {
+            if (!isHtmlEntry(entry.getName())) {
+                continue;
+            }
+            byte[] page = extractor.extractChmEntry(entry);
+            parsePage(page, xhtml);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /** Tells whether an entry name carries one of the HTML suffixes. */
+    private static boolean isHtmlEntry(String entryName) {
+        return entryName.endsWith(".html") || entryName.endsWith(".htm");
+    }
+
+    /**
+     * Parses one extracted page with a fresh {@link HtmlParser}, writing into
+     * {@code xhtml} through an {@link EmbeddedContentHandler} wrapped around
+     * a {@link BodyContentHandler}.
+     *
+     * A {@link SAXException} is rethrown wrapped in a {@link RuntimeException};
+     * an {@link IOException} is deliberately ignored.
+     */
+    private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {
+        Metadata pageMetadata = new Metadata();
+        HtmlParser htmlParser = new HtmlParser();
+        ContentHandler pageHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
+        ParseContext pageContext = new ParseContext();
+        try {
+            InputStream pageStream = new ByteArrayInputStream(byteObject);
+            htmlParser.parse(pageStream, pageHandler, pageMetadata, pageContext);
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        } catch (IOException ignored) {
+            // Pushback overflow from tagsoup
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Defines an accessor interface: a serializable component exposing a single
+ * {@code parse} operation that reads raw bytes on behalf of a target object.
+ * 
+ * @param <T>
+ *            type of the target object handed to {@code parse}
+ */
+public interface ChmAccessor<T> extends Serializable {
+    /**
+     * Parses chm accessor
+     * 
+     * @param data
+     *            bytes to parse (described originally as the chm file;
+     *            NOTE(review): callers may hand in only a portion of the
+     *            file -- confirm per implementation)
+     * @param chmAccessor
+     *            target object of type {@code T}; presumably the instance
+     *            that receives the parsed values -- confirm against the
+     *            implementations
+     * @throws TikaException
+     *             declared so that implementations can report a parse failure
+     */
+    void parse(byte[] data, T chmAccessor) throws TikaException;
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,398 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+    private List<DirectoryListingEntry> dlel;
+    private byte[] data;
+    private int placeHolder = -1;
+    private long dataOffset = -1;
+    private int controlDataIndex = -1;
+    private int resetTableIndex = -1;
+
+    private boolean isNotControlDataFound = true;
+    private boolean isNotResetTableFound = true;
+
+    /**
+     * Constructs chm directory listing set
+     * 
+     * @param data
+     *            byte[]
+     * @param chmItsHeader
+     * @param chmItspHeader
+     * @throws TikaException 
+     */
+    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) throws TikaException {
+        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+        ChmCommons.assertByteArrayNotNull(data);
+        setData(data);
+        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("list:=" + getDirectoryListingEntryList().toString()
+                + System.getProperty("line.separator"));
+        sb.append("number of list items:="
+                + getDirectoryListingEntryList().size());
+        return sb.toString();
+    }
+
+    /**
+     * Returns control data index that located in List
+     * 
+     * @return control data index
+     */
+    public int getControlDataIndex() {
+        return controlDataIndex;
+    }
+
+    /**
+     * Sets control data index
+     * 
+     * @param controlDataIndex
+     */
+    protected void setControlDataIndex(int controlDataIndex) {
+        this.controlDataIndex = controlDataIndex;
+    }
+
+    /**
+     * Return index of reset table
+     * 
+     * @return reset table index
+     */
+    public int getResetTableIndex() {
+        return resetTableIndex;
+    }
+
+    /**
+     * Sets reset table index
+     * 
+     * @param resetTableIndex
+     */
+    protected void setResetTableIndex(int resetTableIndex) {
+        this.resetTableIndex = resetTableIndex;
+    }
+
+    /**
+     * Sets place holder
+     * 
+     * @param placeHolder
+     */
+    private void setPlaceHolder(int placeHolder) {
+        this.placeHolder = placeHolder;
+    }
+
+    private ChmPmglHeader PMGLheader;
+    /**
+     * Enumerates chm directory listing entries
+     * 
+     * @param chmItsHeader
+     *            chm itsf PMGLheader
+     * @param chmItspHeader
+     *            chm itsp PMGLheader
+     */
+    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) {
+        try {
+            int startPmgl = chmItspHeader.getIndex_head();
+            int stopPmgl = chmItspHeader.getUnknown_0024();
+            int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+                    .getHeader_len());
+            setDataOffset(chmItsHeader.getDataOffset());
+
+            /* loops over all pmgls */
+            byte[] dir_chunk = null;
+            for (int i = startPmgl; i>=0; ) {
+                dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+                int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
+                dir_chunk = ChmCommons
+                        .copyOfRange(getData(), start,
+                                start +(int) chmItspHeader.getBlock_len());
+
+                PMGLheader = new ChmPmglHeader();
+                PMGLheader.parse(dir_chunk, PMGLheader);
+                enumerateOneSegment(dir_chunk);
+                
+                i=PMGLheader.getBlockNext();
+                dir_chunk = null;
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            setData(null);
+        }
+    }
+
+    /**
+     * Checks control data
+     * 
+     * @param dle
+     *            chm directory listing entry
+     */
+    private void checkControlData(DirectoryListingEntry dle) {
+        if (isNotControlDataFound) {
+            if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+                setControlDataIndex(getDirectoryListingEntryList().size());
+                isNotControlDataFound = false;
+            }
+        }
+    }
+
+    /**
+     * Checks reset table
+     * 
+     * @param dle
+     *            chm directory listing entry
+     */
+    private void checkResetTable(DirectoryListingEntry dle) {
+        if (isNotResetTableFound) {
+            if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
+                setResetTableIndex(getDirectoryListingEntryList().size());
+                isNotResetTableFound = false;
+            }
+        }
+    }
+
+    public static final boolean startsWith(byte[] data, String prefix) {
+        for (int i=0; i<prefix.length(); i++) {
+            if (data[i]!=prefix.charAt(i)) {
+                return false;
+            }
+        }
+        
+        return true;
+    }
+    /**
+     * Enumerates chm directory listing entries in single chm segment
+     * 
+     * @param dir_chunk
+     */
+    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+//        try {
+            if (dir_chunk != null) {
+                int header_len;
+                if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
+                    header_len = ChmConstants.CHM_PMGI_LEN;
+                    return; //skip PMGI
+                }
+                else if (startsWith(dir_chunk, ChmConstants.PMGL)) {
+                    header_len = ChmConstants.CHM_PMGL_LEN;
+                }
+                else {
+                    throw new ChmParsingException("Bad dir entry block.");
+                }
+
+                placeHolder = header_len;
+                //setPlaceHolder(header_len);
+                while (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
+                        /*&& dir_chunk[placeHolder - 1] != 115*/) 
+                {
+                    //get entry name length
+                    int strlen = 0;// = getEncint(data);
+                    byte temp;
+                    while ((temp=dir_chunk[placeHolder++]) >= 0x80)
+                    {
+                        strlen <<= 7;
+                        strlen += temp & 0x7f;
+                    }
+
+                    strlen = (strlen << 7) + temp & 0x7f;
+                    
+                    if (strlen>dir_chunk.length) {
+                        throw new ChmParsingException("Bad data of a string length.");
+                    }
+                    
+                    DirectoryListingEntry dle = new DirectoryListingEntry();
+                    dle.setNameLength(strlen);
+                    dle.setName(new String(ChmCommons.copyOfRange(
+                                dir_chunk, placeHolder,
+                                (placeHolder + dle.getNameLength())), UTF_8));
+
+                    checkControlData(dle);
+                    checkResetTable(dle);
+                    setPlaceHolder(placeHolder
+                            + dle.getNameLength());
+
+                    /* Sets entry type */
+                    if (placeHolder < dir_chunk.length
+                            && dir_chunk[placeHolder] == 0)
+                        dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+                    else
+                        dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+
+                    setPlaceHolder(placeHolder + 1);
+                    dle.setOffset(getEncint(dir_chunk));
+                    dle.setLength(getEncint(dir_chunk));
+                    getDirectoryListingEntryList().add(dle);
+                }
+                
+//                int indexWorkData = ChmCommons.indexOf(dir_chunk,
+//                        "::".getBytes(UTF_8));
+//                int indexUserData = ChmCommons.indexOf(dir_chunk,
+//                        "/".getBytes(UTF_8));
+//
+//                if (indexUserData>=0 && indexUserData < indexWorkData)
+//                    setPlaceHolder(indexUserData);
+//                else if (indexWorkData>=0) {
+//                    setPlaceHolder(indexWorkData);
+//                }
+//                else {
+//                    setPlaceHolder(indexUserData);
+//                }
+//
+//                if (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
+//                        && dir_chunk[placeHolder - 1] != 115) {// #{
+//                    do {
+//                        if (dir_chunk[placeHolder - 1] > 0) {
+//                            DirectoryListingEntry dle = new DirectoryListingEntry();
+//
+//                            // two cases: 1. when dir_chunk[placeHolder -
+//                            // 1] == 0x73
+//                            // 2. when dir_chunk[placeHolder + 1] == 0x2f
+//                            doNameCheck(dir_chunk, dle);
+//
+//                            // dle.setName(new
+//                            // String(Arrays.copyOfRange(dir_chunk,
+//                            // placeHolder, (placeHolder +
+//                            // dle.getNameLength()))));
+//                            dle.setName(new String(ChmCommons.copyOfRange(
+//                                    dir_chunk, placeHolder,
+//                                    (placeHolder + dle.getNameLength())), UTF_8));
+//                            checkControlData(dle);
+//                            checkResetTable(dle);
+//                            setPlaceHolder(placeHolder
+//                                    + dle.getNameLength());
+//
+//                            /* Sets entry type */
+//                            if (placeHolder < dir_chunk.length
+//                                    && dir_chunk[placeHolder] == 0)
+//                                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+//                            else
+//                                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+//
+//                            setPlaceHolder(placeHolder + 1);
+//                            dle.setOffset(getEncint(dir_chunk));
+//                            dle.setLength(getEncint(dir_chunk));
+//                            getDirectoryListingEntryList().add(dle);
+//                        } else
+//                            setPlaceHolder(placeHolder + 1);
+//
+//                    } while (nextEntry(dir_chunk));
+//                }
+            }
+
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//        }
+    }
+
+
+    /**
+     * Returns encrypted integer
+     * 
+     * @param data_chunk
+     * 
+     * @return
+     */
+    private int getEncint(byte[] data_chunk) {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        byte[] nb = new byte[1];
+
+        if (placeHolder < data_chunk.length) {
+            while ((ob = data_chunk[placeHolder]) < 0) {
+                nb[0] = (byte) ((ob & 0x7f));
+                bi = bi.shiftLeft(7).add(new BigInteger(nb));
+                setPlaceHolder(placeHolder + 1);
+            }
+            nb[0] = (byte) ((ob & 0x7f));
+            bi = bi.shiftLeft(7).add(new BigInteger(nb));
+            setPlaceHolder(placeHolder + 1);
+        }
+        return bi.intValue();
+    }
+
+    /**
+     * Sets chm directory listing entry list
+     * 
+     * @param dlel
+     *            chm directory listing entry list
+     */
+    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+        this.dlel = dlel;
+    }
+
+    /**
+     * Returns chm directory listing entry list
+     * 
+     * @return List<DirectoryListingEntry>
+     */
+    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+        return dlel;
+    }
+
+    /**
+     * Sets data
+     * 
+     * @param data
+     */
+    private void setData(byte[] data) {
+        this.data = data;
+    }
+
+    /**
+     * Returns data
+     * 
+     * @return
+     */
+    private byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Sets data offset
+     * 
+     * @param dataOffset
+     */
+    private void setDataOffset(long dataOffset) {
+        this.dataOffset = dataOffset;
+    }
+
+    /**
+     * Returns data offset
+     * 
+     * @return dataOffset
+     */
+    public long getDataOffset() {
+        return dataOffset;
+    }
+}