You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [5/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ tik...
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/appended-resources/META-INF/LICENSE Wed Jan 6 03:50:50 2016
@@ -0,0 +1,14 @@
+APACHE TIKA SUBCOMPONENTS
+
+Apache Tika includes a number of subcomponents with separate copyright notices
+and license terms. Your use of these subcomponents is subject to the terms and
+conditions of the following licenses.
+
+Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp)
+ are in the public domain. These files were retrieved from:
+ https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp
+ These photos are also available here:
+ https://developers.google.com/speed/webp/gallery2#webp_links
+ Credits for the photo:
+ "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers"
+ Image Author: Jon Sullivan
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.fontbox.afm.AFMParser;
+import org.apache.fontbox.afm.FontMetric;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for AFM Font Files
+ */
+public class AdobeFontMetricParser extends AbstractParser {
+    /** Serial version UID */
+    private static final long serialVersionUID = -4820306522217196835L;
+
+    /** Media type claimed by this parser and written to the Content-Type metadata. */
+    private static final MediaType AFM_TYPE =
+            MediaType.application("x-font-adobe-metric");
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(AFM_TYPE);
+
+    // TIKA-1325 Replace these with properties, from a well known standard
+    // (package-private: TrueTypeParser reuses several of these keys)
+    static final String MET_AVG_CHAR_WIDTH = "AvgCharacterWidth";
+    static final String MET_DOC_VERSION = "DocVersion";
+    static final String MET_PS_NAME = "PSName";
+    static final String MET_FONT_NAME = "FontName";
+    static final String MET_FONT_FULL_NAME = "FontFullName";
+    static final String MET_FONT_FAMILY_NAME = "FontFamilyName";
+    static final String MET_FONT_SUB_FAMILY_NAME = "FontSubFamilyName";
+    static final String MET_FONT_VERSION = "FontVersion";
+    static final String MET_FONT_WEIGHT = "FontWeight";
+    static final String MET_FONT_NOTICE = "FontNotice";
+    static final String MET_FONT_UNDERLINE_THICKNESS = "FontUnderlineThickness";
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return a singleton set holding the AFM media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an AFM file with FontBox, copies the font attributes into
+     * {@code metadata} and emits the file's comments as XHTML paragraphs.
+     * A "Creation Date" comment, when present and well formed, is stored as
+     * metadata instead of being emitted as text.
+     *
+     * @param stream   the AFM document
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the content type, title and font attributes
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if FontBox cannot read the stream
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException declared by the Parser contract
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Have FontBox process the file
+        AFMParser parser = new AFMParser(stream);
+        parser.parse();
+        FontMetric fontMetrics = parser.getResult();
+
+        // Work on a private copy of the comments: the creation date entry is
+        // removed below, and the list handed back by FontBox may be its
+        // internal (possibly unmodifiable) collection.
+        List<String> comments = new java.util.ArrayList<String>(fontMetrics.getComments());
+
+        // Get the creation date
+        extractCreationDate(metadata, comments);
+
+        metadata.set(Metadata.CONTENT_TYPE, AFM_TYPE.toString());
+        metadata.set(TikaCoreProperties.TITLE, fontMetrics.getFullName());
+
+        // Add metadata associated with the font type
+        addMetadataByString(metadata, MET_AVG_CHAR_WIDTH,
+                Float.toString(fontMetrics.getAverageCharacterWidth()));
+        addMetadataByString(metadata, MET_DOC_VERSION,
+                Float.toString(fontMetrics.getAFMVersion()));
+        addMetadataByString(metadata, MET_FONT_NAME, fontMetrics.getFontName());
+        addMetadataByString(metadata, MET_FONT_FULL_NAME, fontMetrics.getFullName());
+        addMetadataByString(metadata, MET_FONT_FAMILY_NAME, fontMetrics.getFamilyName());
+        addMetadataByString(metadata, MET_FONT_VERSION, fontMetrics.getFontVersion());
+        addMetadataByString(metadata, MET_FONT_WEIGHT, fontMetrics.getWeight());
+        addMetadataByString(metadata, MET_FONT_NOTICE, fontMetrics.getNotice());
+        addMetadataByString(metadata, MET_FONT_UNDERLINE_THICKNESS,
+                Float.toString(fontMetrics.getUnderlineThickness()));
+
+        // Output the remaining comments as text
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Display the comments
+        if (!comments.isEmpty()) {
+            xhtml.element("h1", "Comments");
+            xhtml.startElement("div", "class", "comments");
+            for (String comment : comments) {
+                xhtml.element("p", comment);
+            }
+            xhtml.endElement("div");
+        }
+
+        xhtml.endDocument();
+    }
+
+    /** Adds {@code value} under the plain-string key {@code name}, skipping null values. */
+    private void addMetadataByString(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /** Sets {@code property} to {@code value}, skipping null values. */
+    private void addMetadataByProperty(Metadata metadata, Property property, String value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /**
+     * Looks for the first comment that mentions "Creation Date" and carries a
+     * non-empty value after its first colon. That value is stored as the
+     * creation date and the comment is removed from {@code comments} so it is
+     * not emitted as text as well.
+     * <p>
+     * A matching comment without a colon, or with nothing after it, is left in
+     * the list untouched; slicing it blindly would either throw or store
+     * garbage.
+     *
+     * @param metadata receives the creation date
+     * @param comments modifiable list of comments; at most one entry is removed
+     */
+    private void extractCreationDate(Metadata metadata, List<String> comments) {
+        for (int i = 0; i < comments.size(); i++) {
+            String value = comments.get(i);
+            if (!value.matches(".*Creation\\sDate.*")) {
+                continue;
+            }
+
+            int colon = value.indexOf(':');
+            if (colon < 0) {
+                continue;
+            }
+
+            // trim() copes with zero, one or several blanks after the colon
+            String date = value.substring(colon + 1).trim();
+            if (date.isEmpty()) {
+                continue;
+            }
+
+            addMetadataByProperty(metadata, Metadata.CREATION_DATE, date);
+            comments.remove(i);
+            return;
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.NameRecord;
+import org.apache.fontbox.ttf.NamingTable;
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for TrueType font files (TTF).
+ */
+public class TrueTypeParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 44788554612243032L;
+
+    /** Media type claimed by this parser and written to the Content-Type metadata. */
+    private static final MediaType TYPE =
+            MediaType.application("x-font-ttf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(TYPE);
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return a singleton set holding the TrueType media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a TrueType font with FontBox and copies its header dates, version
+     * and naming records into {@code metadata}. No textual content is emitted;
+     * the XHTML document is empty.
+     *
+     * @param stream   the font; if it is a TikaInputStream that already has a
+     *                 backing file, FontBox reads that file directly
+     * @param handler  receives the (empty) XHTML document
+     * @param metadata receives the extracted font attributes
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if FontBox cannot read or close the font
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException declared by the Parser contract
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TikaInputStream tis = TikaInputStream.cast(stream);
+
+        TrueTypeFont font = null;
+        try {
+            // Ask FontBox to parse the file for us
+            TTFParser parser = new TTFParser();
+            if (tis != null && tis.hasFile()) {
+                font = parser.parseTTF(tis.getFile());
+            } else {
+                font = parser.parseTTF(stream);
+            }
+
+            // Report the details of the font
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            metadata.set(TikaCoreProperties.CREATED,
+                    font.getHeader().getCreated());
+            metadata.set(TikaCoreProperties.MODIFIED,
+                    font.getHeader().getModified());
+            metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
+                    Float.toString(font.getHeader().getVersion()));
+
+            // Pull out the naming info. Defensive: skip it if FontBox hands
+            // back no naming table rather than failing the whole parse.
+            NamingTable fontNaming = font.getNaming();
+            if (fontNaming != null) {
+                for (NameRecord nr : fontNaming.getNameRecords()) {
+                    if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
+                        metadata.set(TikaCoreProperties.TITLE, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
+                        metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
+                        metadata.set("Copyright", nr.getString());
+                    }
+                    if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
+                        metadata.set("Trademark", nr.getString());
+                    }
+                }
+            }
+        } finally {
+            // Release the file/stream FontBox keeps open behind the font,
+            // whether or not metadata extraction succeeded.
+            if (font != null) {
+                font.close();
+            }
+        }
+
+        // For now, we only output metadata, no textual contents
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Configuration for TesseractOCRParser.
+ *
+ * This allows you to enable TesseractOCRParser and set its parameters:
+ * <p>
+ * TesseractOCRConfig config = new TesseractOCRConfig();<br>
+ * config.setTesseractPath(tesseractFolder);<br>
+ * parseContext.set(TesseractOCRConfig.class, config);<br>
+ * </p>
+ *
+ * Parameters can also be set either by editing the existing TesseractOCRConfig.properties file in
+ * tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr, or by
+ * overriding it: create your own file and place it in the package org/apache/tika/parser/ocr on the classpath.
+ *
+ */
+public class TesseractOCRConfig implements Serializable{
+
+ private static final long serialVersionUID = -4861942486845757891L;
+
+ // Path to tesseract installation folder, if not on system path.
+ private String tesseractPath = "";
+
+ // Path to the 'tessdata' folder, which contains language files and config files.
+ private String tessdataPath = "";
+
+ // Language dictionary to be used.
+ private String language = "eng";
+
+ // Tesseract page segmentation mode.
+ private String pageSegMode = "1";
+
+ // Minimum file size to submit file to ocr.
+ private int minFileSizeToOcr = 0;
+
+ // Maximum file size to submit file to ocr.
+ private int maxFileSizeToOcr = Integer.MAX_VALUE;
+
+ // Maximum time (seconds) to wait for the ocring process termination
+ private int timeout = 120;
+
+ /**
+ * Default contructor.
+ */
+ public TesseractOCRConfig() {
+ init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
+ }
+
+ /**
+ * Loads properties from InputStream and then tries to close InputStream.
+ * If there is an IOException, this silently swallows the exception
+ * and goes back to the default.
+ *
+ * @param is
+ */
+ public TesseractOCRConfig(InputStream is) {
+ init(is);
+ }
+
+ private void init(InputStream is) {
+ if (is == null) {
+ return;
+ }
+ Properties props = new Properties();
+ try {
+ props.load(is);
+ } catch (IOException e) {
+ } finally {
+ if (is != null) {
+ try {
+ is.close();
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ }
+
+ setTesseractPath(
+ getProp(props, "tesseractPath", getTesseractPath()));
+ setTessdataPath(
+ getProp(props, "tessdataPath", getTessdataPath()));
+ setLanguage(
+ getProp(props, "language", getLanguage()));
+ setPageSegMode(
+ getProp(props, "pageSegMode", getPageSegMode()));
+ setMinFileSizeToOcr(
+ getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
+ setMaxFileSizeToOcr(
+ getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
+ setTimeout(
+ getProp(props, "timeout", getTimeout()));
+
+ }
+
+ /** @see #setTesseractPath(String tesseractPath)*/
+ public String getTesseractPath() {
+ return tesseractPath;
+ }
+
+ /**
+ * Set the path to the Tesseract executable, needed if it is not on system path.
+ * <p>
+ * Note that if you set this value, it is highly recommended that you also
+ * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+ * </p>
+ */
+ public void setTesseractPath(String tesseractPath) {
+ if(!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
+ tesseractPath += File.separator;
+
+ this.tesseractPath = tesseractPath;
+ }
+
+ /** @see #setTessdataPath(String tessdataPath) */
+ public String getTessdataPath() {
+ return tessdataPath;
+ }
+
+ /**
+ * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
+ * as on Windows), this folder is found in the Tesseract installation, but in other cases
+ * (such as when Tesseract is built from source), it may be located elsewhere.
+ */
+ public void setTessdataPath(String tessdataPath) {
+ if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
+ tessdataPath += File.separator;
+
+ this.tessdataPath = tessdataPath;
+ }
+
+ /** @see #setLanguage(String language)*/
+ public String getLanguage() {
+ return language;
+ }
+
+ /**
+ * Set tesseract language dictionary to be used. Default is "eng".
+ * Multiple languages may be specified, separated by plus characters.
+ */
+ public void setLanguage(String language) {
+ if (!language.matches("([A-Za-z](\\+?))*")) {
+ throw new IllegalArgumentException("Invalid language code");
+ }
+ this.language = language;
+ }
+
+ /** @see #setPageSegMode(String pageSegMode)*/
+ public String getPageSegMode() {
+ return pageSegMode;
+ }
+
+ /**
+ * Set tesseract page segmentation mode.
+ * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+ */
+ public void setPageSegMode(String pageSegMode) {
+ if (!pageSegMode.matches("[1-9]|10")) {
+ throw new IllegalArgumentException("Invalid language code");
+ }
+ this.pageSegMode = pageSegMode;
+ }
+
+ /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
+ public int getMinFileSizeToOcr() {
+ return minFileSizeToOcr;
+ }
+
+ /**
+ * Set minimum file size to submit file to ocr.
+ * Default is 0.
+ */
+ public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+ this.minFileSizeToOcr = minFileSizeToOcr;
+ }
+
+ /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
+ public int getMaxFileSizeToOcr() {
+ return maxFileSizeToOcr;
+ }
+
+ /**
+ * Set maximum file size to submit file to ocr.
+ * Default is Integer.MAX_VALUE.
+ */
+ public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+ this.maxFileSizeToOcr = maxFileSizeToOcr;
+ }
+
+ /**
+ * Set maximum time (seconds) to wait for the ocring process to terminate.
+ * Default value is 120s.
+ */
+ public void setTimeout(int timeout) {
+ this.timeout = timeout;
+ }
+
+ /** @see #setTimeout(int timeout)*/
+ public int getTimeout() {
+ return timeout;
+ }
+
+ /**
+ * Get property from the properties file passed in.
+ * @param properties properties file to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default parameter to use.
+ * @return the value.
+ */
+ private int getProp(Properties properties, String property, int defaultMissing) {
+ String p = properties.getProperty(property);
+ if (p == null || p.isEmpty()){
+ return defaultMissing;
+ }
+ try {
+ return Integer.parseInt(p);
+ } catch (Throwable ex) {
+ throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
+ property), ex);
+ }
+ }
+
+ /**
+ * Get property from the properties file passed in.
+ * @param properties properties file to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default parameter to use.
+ * @return the value.
+ */
+ private String getProp(Properties properties, String property, String defaultMissing) {
+ return properties.getProperty(property, defaultMissing);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,329 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import javax.imageio.ImageIO;
+
+import java.awt.Image;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.image.TiffParser;
+import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
+ * create a {@link TesseractOCRConfig} object and pass it through a
+ * ParseContext. Tesseract-ocr must be installed and on system path or the path
+ * to its root folder must be provided:
+ * <p>
+ * TesseractOCRConfig config = new TesseractOCRConfig();<br>
+ * //Needed if tesseract is not on system path<br>
+ * config.setTesseractPath(tesseractFolder);<br>
+ * parseContext.set(TesseractOCRConfig.class, config);<br>
+ * </p>
+ *
+ *
+ */
+public class TesseractOCRParser extends AbstractParser {
+    private static final long serialVersionUID = -8167538283213097265L;
+    private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<MediaType>(Arrays.asList(new MediaType[] {
+                    MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
+                    MediaType.image("x-ms-bmp"), MediaType.image("gif")
+            })));
+
+    // Cache of "does this tesseract command work?" keyed by the full command.
+    // Wrapped in a synchronized map because the cache is static and therefore
+    // shared by every thread that uses this parser.
+    private static final Map<String, Boolean> TESSERACT_PRESENT =
+            Collections.synchronizedMap(new HashMap<String, Boolean>());
+
+    /**
+     * Returns the supported image types when Tesseract can be run with the
+     * configuration found in {@code context} (or the default one), and an
+     * empty set otherwise so that other image parsers get selected instead.
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        // If Tesseract is installed, offer our supported image types
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+        if (hasTesseract(config)) {
+            return SUPPORTED_TYPES;
+        }
+
+        // Otherwise don't advertise anything, so the other image parsers
+        // can be selected instead
+        return Collections.emptySet();
+    }
+
+    /**
+     * Sets TESSDATA_PREFIX in the child process environment: the configured
+     * tessdata path wins, otherwise the tesseract path is used; if both are
+     * empty the environment is left untouched.
+     */
+    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+        String tessdataPrefix = "TESSDATA_PREFIX";
+        Map<String, String> env = pb.environment();
+
+        if (!config.getTessdataPath().isEmpty()) {
+            env.put(tessdataPrefix, config.getTessdataPath());
+        } else if (!config.getTesseractPath().isEmpty()) {
+            env.put(tessdataPrefix, config.getTesseractPath());
+        }
+    }
+
+    /**
+     * Tells whether the tesseract command named by {@code config} can be run.
+     * The answer is cached per command string.
+     */
+    private boolean hasTesseract(TesseractOCRConfig config) {
+        // Fetch where the config says to find Tesseract
+        String tesseract = config.getTesseractPath() + getTesseractProg();
+
+        // Have we already checked for a copy of Tesseract there?
+        // Single lookup: avoids the containsKey/get pair racing with a writer.
+        Boolean cached = TESSERACT_PRESENT.get(tesseract);
+        if (cached != null) {
+            return cached;
+        }
+
+        // Try running Tesseract from there, and see if it exists + works
+        String[] checkCmd = { tesseract };
+        boolean present = ExternalParser.check(checkCmd);
+        TESSERACT_PRESENT.put(tesseract, present);
+        return present;
+    }
+
+    /**
+     * Runs OCR on an in-memory image by rendering it to a temporary PNG file
+     * and delegating to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
+     *
+     * @param image    the image to OCR
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the image metadata
+     * @param context  may carry a {@link TesseractOCRConfig}
+     */
+    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            int w = image.getWidth(null);
+            int h = image.getHeight(null);
+            BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+
+            // Copy the pixels of the source image into the buffer; without
+            // this step the PNG handed to Tesseract would be a blank canvas.
+            java.awt.Graphics2D g2d = bImage.createGraphics();
+            try {
+                g2d.drawImage(image, 0, 0, null);
+            } finally {
+                g2d.dispose();
+            }
+
+            File file = tmp.createTemporaryFile();
+            // Close the writer before the file is read back
+            try (FileOutputStream fos = new FileOutputStream(file)) {
+                ImageIO.write(bImage, "png", fos);
+            }
+            // Close the reader before the temporary file is disposed of
+            try (TikaInputStream tis = TikaInputStream.get(file)) {
+                parse(tis, handler, metadata, context);
+            }
+        } finally {
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Runs Tesseract on the image in {@code stream} and emits the recognised
+     * text, then forwards the image to the regular image parsers so their
+     * metadata is extracted as well. Does nothing at all when Tesseract is
+     * not available. Images whose size is outside the configured
+     * minimum/maximum skip OCR but are still forwarded.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+        // If Tesseract is not on the path with the current config, do not try to run OCR
+        // getSupportedTypes shouldn't have listed us as handling it, so this should only
+        // occur if someone directly calls this parser, not via DefaultParser or similar
+        if (!hasTesseract(config)) {
+            return;
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        TemporaryResources tmp = new TemporaryResources();
+        File output = null;
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+            File input = tikaStream.getFile();
+            long size = tikaStream.getLength();
+
+            if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+
+                File outputBase = tmp.createTemporaryFile();
+
+                // Tesseract appends .txt to output file name. Remember that
+                // name before running it, so the file is cleaned up even
+                // when the OCR run fails or times out.
+                output = new File(outputBase.getAbsolutePath() + ".txt");
+                doOCR(input, outputBase, config);
+
+                if (output.exists()) {
+                    extractOutput(new FileInputStream(output), xhtml);
+                }
+            }
+
+            // Temporary workaround for TIKA-1445 - until we can specify
+            // composite parsers with strategies (eg Composite, Try In Turn),
+            // always send the image onwards to the regular parser to have
+            // the metadata for them extracted as well
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
+        } finally {
+            try {
+                tmp.dispose();
+            } finally {
+                // The .txt file is not tracked by tmp, remove it ourselves
+                if (output != null) {
+                    output.delete();
+                }
+            }
+        }
+    }
+
+    // TIKA-1445 workaround parser
+    private static final Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
+
+    /** Composite of the regular image parsers, used only for the TIKA-1445 workaround. */
+    private static class CompositeImageParser extends CompositeParser {
+        private static final long serialVersionUID = -2398203346206381382L;
+        private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
+                new ImageParser(), new JpegParser(), new TiffParser()
+        });
+
+        CompositeImageParser() {
+            super(new MediaTypeRegistry(), imageParsers);
+        }
+    }
+
+    /**
+     * Run external tesseract-ocr process.
+     *
+     * @param input
+     *          File to be ocred
+     * @param output
+     *          File to collect ocr result
+     * @param config
+     *          Configuration of tesseract-ocr engine
+     * @throws TikaException
+     *           if the extraction timed out or the calling thread was interrupted
+     * @throws IOException
+     *           if an input error occurred
+     */
+    private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+                config.getLanguage(), "-psm", config.getPageSegMode() };
+
+        ProcessBuilder pb = new ProcessBuilder(cmd);
+        setEnv(config, pb);
+        final Process process = pb.start();
+
+        // The process gets no input from us
+        process.getOutputStream().close();
+        InputStream out = process.getInputStream();
+        InputStream err = process.getErrorStream();
+
+        // Drain both pipes in background threads so the process cannot block on a full buffer
+        logStream("OCR MSG", out, input);
+        logStream("OCR ERROR", err, input);
+
+        // waitFor() has no timeout of its own, so run it in a helper thread
+        // and bound the wait through the FutureTask.
+        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+            public Integer call() throws Exception {
+                return process.waitFor();
+            }
+        });
+
+        Thread waitThread = new Thread(waitTask);
+        waitThread.start();
+
+        try {
+            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+        } catch (InterruptedException e) {
+            waitThread.interrupt();
+            process.destroy();
+            // Restore the interrupt flag for the caller
+            Thread.currentThread().interrupt();
+            throw new TikaException("TesseractOCRParser interrupted", e);
+
+        } catch (ExecutionException ignored) {
+            // should not be thrown
+
+        } catch (TimeoutException e) {
+            waitThread.interrupt();
+            process.destroy();
+            throw new TikaException("TesseractOCRParser timeout", e);
+        }
+    }
+
+    /**
+     * Reads the contents of the given stream and write it to the given XHTML
+     * content handler. The stream is closed once fully processed, and also
+     * when the handler fails part-way through.
+     *
+     * @param stream
+     *          Stream where is the result of ocr
+     * @param xhtml
+     *          XHTML content handler
+     * @throws SAXException
+     *           if the XHTML SAX events could not be handled
+     * @throws IOException
+     *           if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+        // Take ownership of the stream first, so it is closed even if the
+        // very first SAX event throws.
+        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+            xhtml.startDocument();
+            xhtml.startElement("div");
+            char[] buffer = new char[1024];
+            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                if (n > 0) {
+                    xhtml.characters(buffer, 0, n);
+                }
+            }
+            xhtml.endElement("div");
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Starts a thread that reads the contents of the standard output or error
+     * stream of the given process to not block the process. The stream is closed
+     * once fully processed. The collected text is logged at debug level;
+     * {@code logType} and {@code file} are currently not part of the message.
+     */
+    private void logStream(final String logType, final InputStream stream, final File file) {
+        new Thread() {
+            public void run() {
+                Reader reader = new InputStreamReader(stream, UTF_8);
+                StringBuilder out = new StringBuilder();
+                char[] buffer = new char[1024];
+                try {
+                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                        out.append(buffer, 0, n);
+                    }
+                } catch (IOException ignored) {
+                    // Best effort: log whatever was read before the failure
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
+
+                String msg = out.toString();
+                LogFactory.getLog(TesseractOCRParser.class).debug(msg);
+            }
+        }.start();
+    }
+
+    /** Returns the tesseract executable name for the current operating system. */
+    static String getTesseractProg() {
+        return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
+    }
+
+}
Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&r1=1723222&r2=1723223&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -14,6 +14,8 @@
# limitations under the License.
+org.apache.tika.parser.font.AdobeFontMetricParser
+org.apache.tika.parser.font.TrueTypeParser
org.apache.tika.parser.image.BPGParser
org.apache.tika.parser.image.ImageParser
org.apache.tika.parser.image.PSDParser
@@ -22,6 +24,7 @@ org.apache.tika.parser.image.WebPParser
org.apache.tika.parser.jpeg.JpegParser
org.apache.tika.parser.audio.AudioParser
org.apache.tika.parser.audio.MidiParser
+org.apache.tika.parser.ocr.TesseractOCRParser
org.apache.tika.parser.mp3.Mp3Parser
org.apache.tika.parser.mp4.MP4Parser
org.apache.tika.parser.video.FLVParser
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties Wed Jan 6 03:50:50 2016
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+tesseractPath=
+language=eng
+pageSegMode=1
+maxFileSizeToOcr=2147483647
+minFileSizeToOcr=0
+timeout=120
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FAMILY_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FULL_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_VERSION;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_WEIGHT;
+import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_PS_NAME;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Verifies that the sample Adobe Font Metric and TrueType files are handled
+ * through {@link AutoDetectParser} and yield the expected metadata.
+ */
+public class FontParsersTest {
+
+    /** Parses the sample AFM file and checks its metadata and extracted comments. */
+    @Test
+    public void testAdobeFontMetricParsing() throws Exception {
+        final Metadata meta = new Metadata();
+        final ContentHandler body = new BodyContentHandler();
+        final Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+
+        try (TikaInputStream afm = TikaInputStream.get(
+                FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) {
+            autoDetect.parse(afm, body, meta, new ParseContext());
+        }
+
+        assertEquals("application/x-font-adobe-metric", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("TestFullName", meta.get(TikaCoreProperties.TITLE));
+        assertEquals("Fri Jul 15 17:50:51 2011", meta.get(Metadata.CREATION_DATE));
+
+        assertEquals("TestFontName", meta.get(MET_FONT_NAME));
+        assertEquals("TestFullName", meta.get(MET_FONT_FULL_NAME));
+        assertEquals("TestSymbol", meta.get(MET_FONT_FAMILY_NAME));
+
+        assertEquals("Medium", meta.get(MET_FONT_WEIGHT));
+        assertEquals("001.008", meta.get(MET_FONT_VERSION));
+
+        // Test that the comments got extracted
+        final String text = body.toString();
+        final String[] expectedSnippets = {
+            "Comments",
+            "This is a comment in a sample file",
+            "UniqueID 12345"
+        };
+        for (String snippet : expectedSnippets) {
+            assertContains(snippet, text);
+        }
+    }
+
+    /** Parses the sample TrueType file and checks its metadata; no text content is expected. */
+    @Test
+    public void testTTFParsing() throws Exception {
+        final Metadata meta = new Metadata();
+        final ContentHandler body = new BodyContentHandler();
+        final Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+
+        //Open Sans font is ASL 2.0 according to
+        //http://www.google.com/fonts/specimen/Open+Sans
+        //...despite the copyright in the file's metadata.
+        try (TikaInputStream ttf = TikaInputStream.get(
+                FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
+            autoDetect.parse(ttf, body, meta, new ParseContext());
+        }
+
+        assertEquals("application/x-font-ttf", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Open Sans Bold", meta.get(TikaCoreProperties.TITLE));
+
+        final String created = "2010-12-30T11:04:00Z";
+        assertEquals(created, meta.get(Metadata.CREATION_DATE));
+        assertEquals(created, meta.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-05-05T12:37:53Z", meta.get(TikaCoreProperties.MODIFIED));
+
+        assertEquals("Open Sans Bold", meta.get(MET_FONT_NAME));
+        assertEquals("Open Sans", meta.get(MET_FONT_FAMILY_NAME));
+        assertEquals("Bold", meta.get(MET_FONT_SUB_FAMILY_NAME));
+        assertEquals("OpenSans-Bold", meta.get(MET_PS_NAME));
+
+        // Only the first nine characters of these two values are checked.
+        final String copyright = meta.get("Copyright");
+        assertEquals("Digitized", copyright.substring(0, 9));
+        final String trademark = meta.get("Trademark");
+        assertEquals("Open Sans", trademark.substring(0, 9));
+
+        // Not extracted
+        assertEquals(null, meta.get(MET_FONT_FULL_NAME));
+        assertEquals(null, meta.get(MET_FONT_WEIGHT));
+        assertEquals(null, meta.get(MET_FONT_VERSION));
+
+        // Currently, the parser doesn't extract any contents
+        assertEquals("", body.toString());
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests for {@link TesseractOCRConfig}: the default values, loading of a
+ * partial and a full properties file, and validation performed by the
+ * language and page segmentation mode setters.
+ */
+public class TesseractOCRConfigTest extends TikaTest {
+
+    /** A config created without a properties stream reports the default values. */
+    @Test
+    public void testNoConfig() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+        assertEquals("Invalid default language value", "eng", config.getLanguage());
+        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
+        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid default timeout value", 120, config.getTimeout());
+    }
+
+    /** Values present in the partial properties file override the defaults; the rest stay default. */
+    @Test
+    public void testPartialConfig() throws Exception {
+        // try-with-resources so the classpath stream is released even if an assertion fails
+        try (InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+                "/test-properties/TesseractOCRConfig-partial.properties")) {
+
+            TesseractOCRConfig config = new TesseractOCRConfig(stream);
+            assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+            assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+            assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+            assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+            assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+            assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+            assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+        }
+    }
+
+    /** Every value is taken from the full properties file. */
+    @Test
+    public void testFullConfig() throws Exception {
+        try (InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+                "/test-properties/TesseractOCRConfig-full.properties")) {
+
+            TesseractOCRConfig config = new TesseractOCRConfig(stream);
+            assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
+            assertEquals("Invalid overridden tessdataPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
+            assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+            assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
+            assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+            assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
+            assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+        }
+    }
+
+    /**
+     * Valid language values must be accepted and an invalid one rejected with
+     * an {@link IllegalArgumentException}. Only the invalid call is allowed to
+     * throw, so a setter that wrongly rejects a valid value fails the test.
+     */
+    @Test
+    public void testValidateLanguage() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setLanguage("eng");
+        config.setLanguage("eng+fra");
+        try {
+            config.setLanguage("rm -Rf *");
+            fail("Invalid language value was accepted");
+        } catch (IllegalArgumentException expected) {
+            // expected: the invalid value was rejected
+        }
+    }
+
+    /**
+     * Valid page segmentation modes must be accepted and an invalid one
+     * rejected with an {@link IllegalArgumentException}. Only the invalid call
+     * is allowed to throw.
+     */
+    @Test
+    public void testValidatePageSegMode() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setPageSegMode("0");
+        config.setPageSegMode("10");
+        try {
+            config.setPageSegMode("11");
+            fail("Invalid pageSegMode value was accepted");
+        } catch (IllegalArgumentException expected) {
+            // expected: the invalid value was rejected
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.mail.RFC822Parser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Tests for {@link TesseractOCRParser}. Tests that need a working Tesseract
+ * installation are skipped or relaxed when {@link #canRun()} reports that
+ * the executable is not available.
+ */
+public class TesseractOCRParserTest extends TikaTest {
+
+    /**
+     * Reports whether the Tesseract executable of a default
+     * {@link TesseractOCRConfig} passes {@link ExternalParser#check}.
+     */
+    public static boolean canRun() {
+        return canRun(new TesseractOCRConfig());
+    }
+
+    /** Same check as {@link #canRun()}, for the executable path of the given config. */
+    private static boolean canRun(TesseractOCRConfig config) {
+        String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
+        // If Tesseract is not on the path, do not run the test.
+        return ExternalParser.check(checkCmd);
+    }
+
+    /*
+    Check that if Tesseract is not found, the TesseractOCRParser claims to not support
+    any file types. So, the standard image parser is called instead.
+     */
+    @Test
+    public void offersNoTypesIfNotFound() throws Exception {
+        TesseractOCRParser parser = new TesseractOCRParser();
+        DefaultParser defaultParser = new DefaultParser();
+        MediaType png = MediaType.image("png");
+
+        // With an invalid path, will offer no types
+        TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
+        invalidConfig.setTesseractPath("/made/up/path");
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, invalidConfig);
+
+        // No types offered
+        assertEquals(0, parser.getSupportedTypes(parseContext).size());
+
+        // And DefaultParser won't use us
+        assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    /*
+    If Tesseract is found, test we retrieve the proper number of supported types.
+     */
+    @Test
+    public void offersTypesIfFound() throws Exception {
+        TesseractOCRParser parser = new TesseractOCRParser();
+        DefaultParser defaultParser = new DefaultParser();
+
+        ParseContext parseContext = new ParseContext();
+        MediaType png = MediaType.image("png");
+
+        // Assuming that Tesseract is on the path, the parser should offer 5 supported types, PNG among them.
+        assumeTrue(canRun());
+
+        assertEquals(5, parser.getSupportedTypes(parseContext).size());
+        assertTrue(parser.getSupportedTypes(parseContext).contains(png));
+
+        // DefaultParser will now select the TesseractOCRParser.
+        assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    @Test
+    public void testPDFOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pdf";
+        String[] nonOCRContains = new String[0];
+        testBasicOCR(resource, nonOCRContains, 2);
+    }
+
+    @Test
+    public void testDOCXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.docx";
+        String[] nonOCRContains = {
+            "This is some text.",
+            "Here is an embedded image:"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    @Test
+    public void testPPTXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pptx";
+        String[] nonOCRContains = {
+            "This is some text"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    /**
+     * Parses the resource recursively with inline PDF image extraction enabled
+     * and checks the number of metadata objects, the non-OCR text and, only
+     * when Tesseract can run, the OCR text.
+     *
+     * @param resource       classpath location of the document to parse
+     * @param nonOCRContains strings that must appear regardless of OCR
+     * @param numMetadatas   expected number of metadata objects collected
+     */
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
+                new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, parser);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
+            parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
+        }
+        List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
+        assertEquals(numMetadatas, metadataList.size());
+
+        StringBuilder contents = new StringBuilder();
+        for (Metadata m : metadataList) {
+            contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+        }
+        String text = contents.toString();
+        if (canRun()) {
+            assertTrue(text.contains("Happy New Year 2003!"));
+        }
+        for (String needle : nonOCRContains) {
+            assertContains(needle, text);
+        }
+        assertTrue(metadataList.get(0).names().length > 10);
+        assertTrue(metadataList.get(1).names().length > 10);
+        //test at least one value
+        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+    }
+
+    @Test
+    public void testSingleImage() throws Exception {
+        assumeTrue(canRun());
+        String xml = getXML("testOCR.jpg").xml;
+        assertContains("OCR Testing", xml);
+    }
+
+    @Test
+    public void getNormalMetadataToo() throws Exception {
+        //this should be successful whether or not TesseractOCR is installed/active
+        //If tesseract is installed, the internal metadata extraction parser should
+        //work; and if tesseract isn't installed, the regular parsers should take over.
+
+        //gif
+        Metadata m = getXML("testGIF.gif").metadata;
+        assertTrue(m.names().length > 20);
+        assertEquals("RGB", m.get("Chroma ColorSpaceType"));
+
+        //jpg
+        m = getXML("testOCR.jpg").metadata;
+        assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
+        assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
+
+        //bmp
+        m = getXML("testBMP.bmp").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+
+        //png
+        m = getXML("testPNG.png").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
+
+        //tiff
+        m = getXML("testTIFF.tif").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("72 dots per inch", m.get("Y Resolution"));
+    }
+
+    /**
+     * Parses the multipart RFC822 sample twice: once against a mocked handler
+     * to count the emitted {@code div} elements, once to inspect the body text.
+     * Exceptions propagate so a failure keeps its full stack trace.
+     */
+    @Test
+    public void testMultipart() throws Exception {
+        ContentHandler handler = mock(XHTMLContentHandler.class);
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            new RFC822Parser().parse(stream, handler, new Metadata(), new ParseContext());
+        }
+        verify(handler).startDocument();
+        // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
+        // But, different versions of Tesseract lead to a different number of invocations. So, we
+        // only verify the handler if Tesseract cannot run.
+        if (!canRun()) {
+            int bodyExpectedTimes = 4;
+            verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+            verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+        }
+
+        //repeat, this time looking at content
+        handler = new BodyContentHandler();
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            new RFC822Parser().parse(stream, handler, new Metadata(), new ParseContext());
+        }
+        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+        String bodyText = handler.toString();
+        assertTrue(bodyText.contains("body 1"));
+        assertTrue(bodyText.contains("body 2"));
+        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+    }
+
+    /** Opens the named resource through the context class loader, failing the test if it is missing. */
+    private static InputStream getStream(String name) {
+        InputStream stream = Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream(name);
+        assertNotNull("Test file not found " + name, stream);
+        return stream;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-office-module</artifactId>
+ <name>Apache Tika Office Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-scratchpad</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-ooxml</artifactId>
+ <version>${poi.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>stax</groupId>
+ <artifactId>stax-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xml-apis</groupId>
+ <artifactId>xml-apis</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.healthmarketscience.jackcess</groupId>
+ <artifactId>jackcess</artifactId>
+ <version>2.1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>com.healthmarketscience.jackcess</groupId>
+ <artifactId>jackcess-encrypt</artifactId>
+ <version>2.1.1</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-package-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-web-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for the {@code application/vnd.ms-htmlhelp} (CHM) media types.
+ * Every directory listing entry whose name ends in ".html" or ".htm" is
+ * extracted, parsed with {@link HtmlParser}, and its body content is written
+ * into a single XHTML document.
+ */
+public class ChmParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 5938777307516469802L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-htmlhelp"),
+                    MediaType.application("chm"),
+                    MediaType.application("x-chm"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Sets the content type to {@code application/vnd.ms-htmlhelp} and emits
+     * the body content of every HTML entry of the CHM file between one
+     * start/end document pair.
+     *
+     * @throws SAXException if the content handler fails, including while an
+     *                      embedded HTML page is being written
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        ChmExtractor chmExtractor = new ChmExtractor(stream);
+
+        // metadata
+        metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
+
+        // content
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
+            final String entryName = entry.getName();
+            if (entryName.endsWith(".html") || entryName.endsWith(".htm")) {
+                byte[] data = chmExtractor.extractChmEntry(entry);
+                parsePage(data, xhtml);
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Parses one extracted HTML page with a fresh {@link HtmlParser},
+     * {@link Metadata} and {@link ParseContext}, forwarding only its body
+     * content to the given handler.
+     *
+     * @param byteObject raw bytes of the HTML entry
+     * @param xhtml      handler receiving the page body
+     * @throws SAXException if the handler fails; propagated as-is because the
+     *                      caller already declares it, rather than being hidden
+     *                      inside a {@code RuntimeException}
+     */
+    private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException, SAXException {
+        Metadata metadata = new Metadata();
+        HtmlParser htmlParser = new HtmlParser();
+        ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
+        ParseContext pageContext = new ParseContext();
+        try {
+            InputStream stream = new ByteArrayInputStream(byteObject);
+            htmlParser.parse(stream, handler, metadata, pageContext);
+        } catch (IOException ignored) {
+            // Deliberately ignored: pushback overflow from tagsoup
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Contract for CHM structures that are filled in from a raw byte array.
+ * <p>
+ * NOTE(review): the directory listing code invokes this as
+ * {@code header.parse(bytes, header)}, i.e. the receiver is also passed as the
+ * target object; confirm that every implementation expects that calling pattern.
+ *
+ * @param <T> type of the object that receives the parsed values
+ */
+public interface ChmAccessor<T> extends Serializable {
+ /**
+ * Parses {@code data} on behalf of {@code chmAccessor}.
+ *
+ * @param data
+ * raw bytes to parse
+ * @param chmAccessor
+ * object that receives the parsed values
+ * @throws TikaException
+ * presumably when the bytes cannot be parsed; confirm against the implementations
+ */
+ void parse(byte[] data, T chmAccessor) throws TikaException;
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,398 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+ // Entries collected so far, in the order they were decoded.
+ private List<DirectoryListingEntry> dlel;
+ // Raw bytes handed to the constructor; reset to null once enumeration finishes.
+ private byte[] data;
+ // Read cursor into the directory chunk currently being decoded (-1 before any chunk is read).
+ private int placeHolder = -1;
+ // Data offset copied from the ITSF header during enumeration (-1 until then).
+ private long dataOffset = -1;
+ // List position recorded for the first entry whose name contains ChmConstants.CONTROL_DATA (-1 if none).
+ private int controlDataIndex = -1;
+ // List position recorded for the first entry whose name contains ChmConstants.RESET_TABLE (-1 if none).
+ private int resetTableIndex = -1;
+
+ // Latches: each stays true until its entry has been recorded, so only the first match is kept.
+ private boolean isNotControlDataFound = true;
+ private boolean isNotResetTableFound = true;
+
+ /**
+  * Builds the directory listing by walking the directory chunks found in
+  * {@code data}.
+  * <p>
+  * Failures while walking the chunks are handled inside the enumeration step
+  * and do not propagate from this constructor, so the resulting list may be
+  * empty or partial for malformed input.
+  *
+  * @param data
+  *            raw bytes of the CHM file; checked with
+  *            {@code ChmCommons.assertByteArrayNotNull}
+  * @param chmItsHeader
+  *            ITSF header, source of the directory and data offsets
+  * @param chmItspHeader
+  *            ITSP header, source of the header length, block length and first block index
+  * @throws TikaException
+  *             presumably when {@code data} is null (raised by the null check)
+  */
+ public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+         ChmItspHeader chmItspHeader) throws TikaException {
+     // Validate before touching any state.
+     ChmCommons.assertByteArrayNotNull(data);
+     // Assign fields directly: calling the public, overridable setter here
+     // would run subclass code against a partially constructed object.
+     this.dlel = new ArrayList<DirectoryListingEntry>();
+     this.data = data;
+     enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+ }
+
+ /**
+  * Renders the entry list followed, on a new line, by the number of entries.
+  *
+  * @return textual dump of the listing
+  */
+ @Override
+ public String toString() {
+     final List<DirectoryListingEntry> entries = getDirectoryListingEntryList();
+     final String listLine = "list:=" + entries.toString()
+             + System.getProperty("line.separator");
+     final String countLine = "number of list items:=" + entries.size();
+     return listLine + countLine;
+ }
+
+ /**
+ * Returns the list position recorded for the control-data entry.
+ *
+ * @return position in the directory listing entry list recorded for the first
+ * entry whose name contains {@code ChmConstants.CONTROL_DATA}; -1 if
+ * nothing has been recorded
+ */
+ public int getControlDataIndex() {
+ return controlDataIndex;
+ }
+
+ /**
+ * Stores the list position of the control-data entry.
+ *
+ * @param controlDataIndex
+ * position in the directory listing entry list
+ */
+ protected void setControlDataIndex(int controlDataIndex) {
+ this.controlDataIndex = controlDataIndex;
+ }
+
+ /**
+ * Returns the list position recorded for the reset-table entry.
+ *
+ * @return position in the directory listing entry list recorded for the first
+ * entry whose name contains {@code ChmConstants.RESET_TABLE}; -1 if
+ * nothing has been recorded
+ */
+ public int getResetTableIndex() {
+ return resetTableIndex;
+ }
+
+ /**
+ * Stores the list position of the reset-table entry.
+ *
+ * @param resetTableIndex
+ * position in the directory listing entry list
+ */
+ protected void setResetTableIndex(int resetTableIndex) {
+ this.resetTableIndex = resetTableIndex;
+ }
+
+ /**
+ * Moves the read cursor used while decoding a directory chunk.
+ *
+ * @param placeHolder
+ * new cursor position within the chunk being decoded
+ */
+ private void setPlaceHolder(int placeHolder) {
+ this.placeHolder = placeHolder;
+ }
+
+ // Header of the directory chunk most recently parsed; its free-space and next-block values drive the enumeration.
+ private ChmPmglHeader PMGLheader;
+ /**
+ * Enumerates chm directory listing entries
+ *
+ * @param chmItsHeader
+ * chm itsf PMGLheader
+ * @param chmItspHeader
+ * chm itsp PMGLheader
+ */
+ private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+ ChmItspHeader chmItspHeader) {
+ try {
+ int startPmgl = chmItspHeader.getIndex_head();
+ int stopPmgl = chmItspHeader.getUnknown_0024();
+ int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+ .getHeader_len());
+ setDataOffset(chmItsHeader.getDataOffset());
+
+ /* loops over all pmgls */
+ byte[] dir_chunk = null;
+ for (int i = startPmgl; i>=0; ) {
+ dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+ int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
+ dir_chunk = ChmCommons
+ .copyOfRange(getData(), start,
+ start +(int) chmItspHeader.getBlock_len());
+
+ PMGLheader = new ChmPmglHeader();
+ PMGLheader.parse(dir_chunk, PMGLheader);
+ enumerateOneSegment(dir_chunk);
+
+ i=PMGLheader.getBlockNext();
+ dir_chunk = null;
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ setData(null);
+ }
+ }
+
+ /**
+ * Checks control data
+ *
+ * @param dle
+ * chm directory listing entry
+ */
+ private void checkControlData(DirectoryListingEntry dle) {
+ if (isNotControlDataFound) {
+ if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+ setControlDataIndex(getDirectoryListingEntryList().size());
+ isNotControlDataFound = false;
+ }
+ }
+ }
+
+ /**
+ * Checks reset table
+ *
+ * @param dle
+ * chm directory listing entry
+ */
+ private void checkResetTable(DirectoryListingEntry dle) {
+ if (isNotResetTableFound) {
+ if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
+ setResetTableIndex(getDirectoryListingEntryList().size());
+ isNotResetTableFound = false;
+ }
+ }
+ }
+
+ /**
+  * Tells whether {@code data} begins with the characters of {@code prefix}.
+  * <p>
+  * Each byte is compared with the char at the same position, so only prefixes
+  * made of 7-bit characters can match.
+  *
+  * @param data
+  *            bytes to inspect; may be null
+  * @param prefix
+  *            expected leading characters; may be null
+  * @return {@code true} when every character of {@code prefix} equals the byte
+  *         at the same position; {@code false} otherwise, including when
+  *         either argument is null or {@code data} is shorter than {@code prefix}
+  */
+ public static final boolean startsWith(byte[] data, String prefix) {
+     // A buffer shorter than the prefix used to raise ArrayIndexOutOfBoundsException.
+     if (data == null || prefix == null || data.length < prefix.length()) {
+         return false;
+     }
+     for (int i = 0; i < prefix.length(); i++) {
+         if (data[i] != prefix.charAt(i)) {
+             return false;
+         }
+     }
+     return true;
+ }
+ /**
+  * Decodes every directory entry stored in one directory chunk and appends
+  * them to the entry list.
+  * <p>
+  * Chunks starting with the PMGI marker are skipped. For PMGL chunks, entries
+  * are read from just after the header up to the chunk's free space. Each
+  * entry is read as: a variable-length name length (7 bits per byte, high bit
+  * set on every byte except the last), the UTF-8 name, one byte that selects
+  * the entry type (0 means uncompressed), then two variable-length integers
+  * for offset and length.
+  *
+  * @param dir_chunk
+  *            bytes of one directory chunk; {@code null} is ignored
+  * @throws ChmParsingException
+  *             when the chunk carries neither marker or a name length is implausible
+  */
+ private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+     if (dir_chunk == null) {
+         return;
+     }
+     if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
+         return; //skip PMGI
+     }
+     if (!startsWith(dir_chunk, ChmConstants.PMGL)) {
+         throw new ChmParsingException("Bad dir entry block.");
+     }
+
+     placeHolder = ChmConstants.CHM_PMGL_LEN;
+     while (placeHolder > 0
+             && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()) {
+         // Name length. Java bytes are signed, so a set high bit (continuation)
+         // shows up as a negative value; testing ">= 0x80" can never be true.
+         int strlen = 0;
+         byte temp;
+         while ((temp = dir_chunk[placeHolder++]) < 0) {
+             strlen = (strlen << 7) + (temp & 0x7f);
+         }
+         // Parenthesised on purpose: '+' binds tighter than '&'.
+         strlen = (strlen << 7) + (temp & 0x7f);
+
+         if (strlen < 0 || strlen > dir_chunk.length) {
+             throw new ChmParsingException("Bad data of a string length.");
+         }
+
+         DirectoryListingEntry dle = new DirectoryListingEntry();
+         dle.setNameLength(strlen);
+         dle.setName(new String(ChmCommons.copyOfRange(
+                 dir_chunk, placeHolder,
+                 (placeHolder + dle.getNameLength())), UTF_8));
+
+         // Must run before the entry is appended: both record the current list size.
+         checkControlData(dle);
+         checkResetTable(dle);
+         setPlaceHolder(placeHolder + dle.getNameLength());
+
+         /* Sets entry type */
+         if (placeHolder < dir_chunk.length && dir_chunk[placeHolder] == 0) {
+             dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+         } else {
+             dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+         }
+
+         setPlaceHolder(placeHolder + 1);
+         dle.setOffset(getEncint(dir_chunk));
+         dle.setLength(getEncint(dir_chunk));
+         getDirectoryListingEntryList().add(dle);
+     }
+ }
+
+
+ /**
+  * Reads one variable-length encoded integer at the current cursor position
+  * and advances the cursor past it.
+  * <p>
+  * Every byte contributes its low 7 bits, most significant group first; a set
+  * high bit means another byte follows. Values wider than 32 bits keep only
+  * their low 32 bits, which is what the previous {@code BigInteger.intValue()}
+  * based implementation returned as well.
+  *
+  * @param data_chunk
+  *            chunk being decoded
+  * @return the decoded value, or 0 when the cursor is already at or past the
+  *         end of {@code data_chunk}
+  */
+ private int getEncint(byte[] data_chunk) {
+     int value = 0;
+     if (placeHolder < data_chunk.length) {
+         byte ob;
+         do {
+             ob = data_chunk[placeHolder];
+             value = (value << 7) + (ob & 0x7f);
+             setPlaceHolder(placeHolder + 1);
+         } while (ob < 0); // negative byte == high bit set == more bytes follow
+     }
+     return value;
+ }
+
+ /**
+ * Replaces the list that holds the directory listing entries.
+ *
+ * @param dlel
+ * list to use from now on; stored as given, not copied
+ */
+ public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+ this.dlel = dlel;
+ }
+
+ /**
+ * Returns the directory listing entries.
+ *
+ * @return the internal list itself, not a copy
+ */
+ public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+ return dlel;
+ }
+
+ /**
+ * Stores the raw byte array to read directory chunks from.
+ *
+ * @param data
+ * raw bytes, or null to release the reference
+ */
+ private void setData(byte[] data) {
+ this.data = data;
+ }
+
+ /**
+ * Returns the raw byte array directory chunks are read from.
+ *
+ * @return the stored bytes; null once enumeration has finished
+ */
+ private byte[] getData() {
+ return data;
+ }
+
+ /**
+ * Stores the data offset taken from the ITSF header.
+ *
+ * @param dataOffset
+ * offset value to remember
+ */
+ private void setDataOffset(long dataOffset) {
+ this.dataOffset = dataOffset;
+ }
+
+ /**
+ * Returns the data offset taken from the ITSF header during enumeration.
+ *
+ * @return the stored offset; -1 if it was never set
+ */
+ public long getDataOffset() {
+ return dataOffset;
+ }
+}