You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/29 19:15:20 UTC

[tika] branch branch_1x updated (f9910e2 -> 04225d2)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from f9910e2  update CHANGES.txt because of conflict in cherry-pick
     new d1526d0  Fix for TIKA-2582 contributed by ewanmellor.
     new b2ca378  Fix for TIKA-2584 contributed by ewanmellor.
     new 2efe3f9  Fix for TIKA-2613 contributed by ewanmellor.
     new 04225d2  TIKA-2621 -- add support for brotli

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/tika/mime/tika-mimetypes.xml        |  14 ++++
 .../src/test/java/org/apache/tika/TikaTest.java    |  14 ++++
 tika-parsers/pom.xml                               |   7 ++
 .../apache/tika/parser/ocr/TesseractOCRConfig.java |  64 ++++++++++++++++
 .../apache/tika/parser/ocr/TesseractOCRParser.java |  18 ++++-
 .../apache/tika/parser/pkg/CompressorParser.java   |  82 +++++++++++++++++++--
 .../tika/parser/pkg/CompressorParserTest.java      |  22 +++++-
 .../test-documents/testBROTLI_compressed.br        | Bin 0 -> 12 bytes
 8 files changed, 208 insertions(+), 13 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 02/04: Fix for TIKA-2584 contributed by ewanmellor.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b2ca3781f7a27e7c0ca627359d13a66c56940039
Author: Ewan Mellor <co...@ewanmellor.org>
AuthorDate: Wed Feb 21 13:43:44 2018 -0800

    Fix for TIKA-2584 contributed by ewanmellor.
    
    Add TesseractOCRConfig.{add,get}OtherTesseractConfig, plus parsing of
    TesseractOCRConfig.properties to extract any key-value pair where the
    key has an underscore.
    
    Inside TesseractOCRParser, pass these key-value pairs to Tesseract
    using its -c command line option.
    
    This gives a mechanism by which user code can pass arbitrary options
    to Tesseract without Tika having to understand them.
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 42 ++++++++++++++++++++++
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 15 ++++++--
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 4139cd2..07bb7f8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -20,7 +20,9 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;
+import java.util.HashMap;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Properties;
 
 /**
@@ -100,6 +102,9 @@ public class TesseractOCRConfig implements Serializable {
     // whether or not to apply rotation calculated by the rotation.py script
     private boolean applyRotation = false;
 
+    // See addOtherTesseractConfig.
+    private Map<String, String> otherTesseractConfig = new HashMap<>();
+
 
     /**
      * Default contructor.
@@ -178,6 +183,7 @@ public class TesseractOCRConfig implements Serializable {
         setApplyRotation(
         		getProp(props, "applyRotation", getApplyRotation()));
 
+        loadOtherTesseractConfig(props);
     }
 
     /**
@@ -517,6 +523,28 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
+     * @see #addOtherTesseractConfig(String, String)
+     */
+    public Map<String, String> getOtherTesseractConfig() {
+        return otherTesseractConfig;
+    }
+
+    /**
+     * Add a key-value pair to pass to Tesseract using its -c command line option.
+     * To see the possible options, run tesseract --print-parameters.
+     *
+     * You may also add these parameters in TesseractOCRConfig.properties; any
+     * key-value pair in the properties file where the key contains an underscore
+     * is passed directly to Tesseract.
+     *
+     * @param key   name of the Tesseract parameter to set
+     * @param value value to pass to Tesseract for that parameter
+     */
+    public void addOtherTesseractConfig(String key, String value) {
+        otherTesseractConfig.put(key, value);
+    }
+
+    /**
      * Get property from the properties file passed in.
      *
      * @param properties     properties file to read from.
@@ -565,4 +593,18 @@ public class TesseractOCRConfig implements Serializable {
                 property, propVal));
     }
 
+    /**
+     * Populate otherTesseractConfig from the given properties.
+     * This assumes that any key-value pair where the key contains
+     * an underscore is an option to be passed opaquely to Tesseract.
+     *
+     * @param properties properties file to read from.
+     */
+    private void loadOtherTesseractConfig(Properties properties) {
+        for (String k : properties.stringPropertyNames()) {
+            if (k.contains("_")) {
+                otherTesseractConfig.put(k, properties.getProperty(k));
+            }
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 3e15c44..6bf2ab4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -34,6 +34,7 @@ import java.io.Reader;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -465,12 +466,20 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
      *           if an input error occurred
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+        ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
+                config.getTesseractPath() + getTesseractProg(), input.getPath(),  output.getPath(), "-l",
                 config.getLanguage(), "-psm", config.getPageSegMode(),
-                config.getOutputType().name().toLowerCase(Locale.US),
+                config.getOutputType().name().toLowerCase(Locale.US)
+        ));
+        for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
+            cmd.add("-c");
+            cmd.add(entry.getKey() + "=" + entry.getValue());
+        }
+        cmd.addAll(Arrays.asList(
                 "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
-                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
+                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"
+        ));
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
         final Process process = pb.start();

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 04/04: TIKA-2621 -- add support for brotli

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 04225d2834104c973e6cff421c283af876b2e398
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Mar 29 13:49:59 2018 -0400

    TIKA-2621 -- add support for brotli
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |  14 ++++
 .../src/test/java/org/apache/tika/TikaTest.java    |  14 ++++
 tika-parsers/pom.xml                               |   7 ++
 .../apache/tika/parser/pkg/CompressorParser.java   |  82 +++++++++++++++++++--
 .../tika/parser/pkg/CompressorParserTest.java      |  22 +++++-
 .../test-documents/testBROTLI_compressed.br        | Bin 0 -> 12 bytes
 6 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 346eb73..634d9d1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3135,6 +3135,20 @@
       <match value="bplist" type="string" offset="0"/>
     </magic>
   </mime-type>
+  <mime-type type="application/x-gtar">
+    <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
+    <magic priority="50">
+      <!-- GNU tar archive -->
+      <match value="ustar  \0" type="string" offset="257" />
+    </magic>
+    <glob pattern="*.gtar"/>
+    <sub-class-of type="application/x-tar"/>
+  </mime-type>
+
+  <mime-type type="application/x-brotli">
+    <glob pattern="*.br" />
+    <glob pattern="*.brotli" />
+  </mime-type>
 
   <mime-type type="application/x-bzip">
     <magic priority="40">
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 153a564..9c827f7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -213,6 +213,20 @@ public abstract class TikaTest {
         return getRecursiveMetadata(filePath, new ParseContext());
     }
 
+    protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
+        return getRecursiveMetadata(filePath, new ParseContext(), metadata);
+    }
+
+    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
+        Parser p = new AutoDetectParser();
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+        try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+            wrapper.parse(is, new DefaultHandler(), metadata, context);
+        }
+        return wrapper.getMetadata();
+    }
+
     protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
         Parser p = new AutoDetectParser();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index a3e9e4d..e6c7720 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -40,6 +40,8 @@
     <codec.version>1.10</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
     <tukaani.version>1.8</tukaani.version>
+    <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
+    <brotli.version>0.1.2</brotli.version>
     <mime4j.version>0.8.1</mime4j.version>
     <vorbis.version>0.8</vorbis.version>
     <pdfbox.version>2.0.9</pdfbox.version>
@@ -151,6 +153,11 @@
       <version>${tukaani.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.brotli</groupId>
+      <artifactId>dec</artifactId>
+      <version>${brotli.version}</version>
+    </dependency>
+    <dependency>
       <groupId>com.github.luben</groupId>
       <artifactId>zstd-jni</artifactId>
       <version>1.3.3-3</version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index ada7ec9..658d04c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -21,6 +21,10 @@ import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.compress.MemoryLimitException;
@@ -78,9 +82,47 @@ public class CompressorParser extends AbstractParser {
     private static final MediaType ZSTD = MediaType.application("zstd");
     private static final MediaType DEFLATE64= MediaType.application("deflate64");
 
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
-                    XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD);
+    private static Set<MediaType> SUPPORTED_TYPES;
+    private static Map<String, String> MIMES_TO_NAME;
+
+    static {
+        Set<MediaType> TMP_SET = new HashSet<>();
+        TMP_SET.addAll(
+                MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
+                        XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA));
+        try {
+            Class.forName("org.brotli.dec.BrotliInputStream");
+            TMP_SET.add(BROTLI);
+        } catch (NoClassDefFoundError|ClassNotFoundException e) {
+            //swallow: brotli decoder class could not be loaded, so BROTLI is left out of SUPPORTED_TYPES
+        }
+        try {
+            Class.forName("com.github.luben.zstd.ZstdInputStream");
+            TMP_SET.add(ZSTD);
+        } catch (NoClassDefFoundError|ClassNotFoundException e) {
+            //swallow: zstd stream class could not be loaded, so ZSTD is left out of SUPPORTED_TYPES
+        }
+        SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET);
+    }
+
+    static {
+        //map the mime type strings to the compressor stream names
+        Map<String, String> tmpMimesToName = new HashMap<>();
+        tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2);
+        tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP);
+        tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED);
+        tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK);
+        tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ);
+        tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200);
+        tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED);
+        tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE);
+        tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z);
+        tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA);
+        tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI);
+        tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD);
+        MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName);
+    }
+
 
     private int memoryLimitInKb = 100000;//100MB
 
@@ -181,7 +223,19 @@ public class CompressorParser extends AbstractParser {
                  });
             CompressorStreamFactory factory =
                     new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
-            cis = factory.createCompressorInputStream(stream);
+            //if we've already identified it via autodetect
+            //trust that and go with the appropriate name
+            //to avoid calling CompressorStreamFactory.detect() twice
+            String name = getStreamName(metadata);
+            if (name != null) {
+                cis = factory.createCompressorInputStream(name, stream);
+            } else {
+                cis = factory.createCompressorInputStream(stream);
+                MediaType type = getMediaType(cis);
+                if (!type.equals(MediaType.OCTET_STREAM)) {
+                    metadata.set(CONTENT_TYPE, type.toString());
+                }
+            }
         } catch (CompressorException e) {
             if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
                 throw new TikaMemoryLimitException(e.getMessage());
@@ -189,10 +243,6 @@ public class CompressorParser extends AbstractParser {
             throw new TikaException("Unable to uncompress document stream", e);
         }
 
-        MediaType type = getMediaType(cis);
-        if (!type.equals(MediaType.OCTET_STREAM)) {
-            metadata.set(CONTENT_TYPE, type.toString());
-        }
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
@@ -215,6 +265,8 @@ public class CompressorParser extends AbstractParser {
                     name = name.substring(0, name.length() - 5);
                 } else if (name.endsWith(".pack")) {
                     name = name.substring(0, name.length() - 5);
+                } else if (name.endsWith(".br")) {
+                    name = name.substring(0, name.length() - 3);
                 } else if (name.length() > 0) {
                     name = GzipUtils.getUncompressedFilename(name);
                 }
@@ -234,6 +286,20 @@ public class CompressorParser extends AbstractParser {
         xhtml.endDocument();
     }
 
+    /**
+     * @param metadata metadata whose content-type value is looked up
+     * @return CompressorStream name based on the content-type value
+     * in metadata or <code>null</code> if the content type is absent
+     * or has no known stream name
+     */
+    private String getStreamName(Metadata metadata) {
+        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
+        if (mimeString == null) {
+            return null;
+        }
+        return MIMES_TO_NAME.get(mimeString);
+    }
+
     @Field
     public void setMemoryLimitInKb(int memoryLimitInKb) {
         this.memoryLimitInKb = memoryLimitInKb;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 26552eb..9a1d579 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -21,14 +21,25 @@ package org.apache.tika.parser.pkg;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
 
+import java.io.BufferedWriter;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -39,7 +50,6 @@ public class CompressorParserTest extends TikaTest {
 
     @BeforeClass
     public static void setUp() {
-        NOT_COVERED.add(MediaType.application("x-brotli"));
         NOT_COVERED.add(MediaType.application("x-lz4-block"));
         NOT_COVERED.add(MediaType.application("x-snappy-raw"));
         NOT_COVERED.add(MediaType.application("deflate64"));
@@ -68,6 +78,16 @@ public class CompressorParserTest extends TikaTest {
     }
 
     @Test
+    public void testBrotli() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
+        List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
+
+        assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+
+    @Test
     public void testCoverage() throws Exception {
         //test that the package parser covers all inputstreams handled
         //by CompressorStreamFactory.  When we update commons-compress, and they add
diff --git a/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br
new file mode 100644
index 0000000..3769516
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br differ

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 01/04: Fix for TIKA-2582 contributed by ewanmellor.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d1526d053f91497ac7bcd4509f1555f4347377d6
Author: Ewan Mellor <co...@ewanmellor.org>
AuthorDate: Wed Feb 21 13:09:53 2018 -0800

    Fix for TIKA-2582 contributed by ewanmellor.
    
    Tesseract 4.0 includes a change to use form feed characters to separate
    pages by default in its text output. Previous versions used no separator
    unless you specified the include_page_breaks option.
    
    This confuses any parser that is not expecting the FF.
    ODFParserTest.testOO2Metadata fails, because it is expecting the output of
    a blank image to be the empty string, but now the FF is there.
    
    I haven't seen any other failures, but I expect that user code will now see
    either FF or U+FFFD where they are not expecting it (SafeContentHandler
    replaces the FF with U+FFFD when converting to text to XML).
    
    Fix this by setting Tesseract's page_separator option to the empty string.
    This will preserve the no-page-breaks behavior with both Tesseract 3.x and
    4.0.
    
    Also, add an option TesseractOCRConfig.pageSeparator so that user code can
    request the FF or any other separator, if they want it.
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 22 ++++++++++++++++++++++
 .../apache/tika/parser/ocr/TesseractOCRParser.java |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index afe0a21..4139cd2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -91,6 +91,9 @@ public class TesseractOCRConfig implements Serializable {
     // factor by which image is to be scaled.
     private int resize = 900;
 
+    // See setPageSeparator.
+    private String pageSeparator = "";
+
     // whether or not to preserve interword spacing
     private boolean preserveInterwordSpacing = false;
 
@@ -256,6 +259,25 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
+     * @see #setPageSeparator(String pageSeparator)
+     */
+    public String getPageSeparator() {
+        return pageSeparator;
+    }
+
+    /**
+     * The page separator to use in plain text output.  This corresponds to Tesseract's page_separator config option.
+     * The default here is the empty string (i.e. no page separators).  Note that this is also the default in
+     * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character.  We are overriding
+     * Tesseract 4.0's default here.
+     *
+     * @param pageSeparator separator string passed to Tesseract as its page_separator option
+     */
+    public void setPageSeparator(String pageSeparator) {
+        this.pageSeparator = pageSeparator;
+    }
+
+    /**
      * Whether or not to maintain interword spacing.  Default is <code>false</code>.
      *
      * @param preserveInterwordSpacing
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 08847fd..3e15c44 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -468,6 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
                 config.getLanguage(), "-psm", config.getPageSegMode(),
                 config.getOutputType().name().toLowerCase(Locale.US),
+                "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
                 (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
         ProcessBuilder pb = new ProcessBuilder(cmd);

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 03/04: Fix for TIKA-2613 contributed by ewanmellor.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2efe3f97a7df0ac8863b225beb2deb41e99c1e90
Author: Ewan Mellor <co...@ewanmellor.org>
AuthorDate: Mon Mar 26 16:25:31 2018 -0700

    Fix for TIKA-2613 contributed by ewanmellor.
    
    Change -psm on the Tesseract command line to --psm, with two dashes.
    This matches a change in Tesseract 4.0 to remove the one-dash version.
    It has been deprecated since Nov 2016.
    
    The Tesseract cset is ee201e1f4.
    
    Also, move the config file (i.e. getOutputType in Tika's terms) so that it
    is the last parameter on the command line.  Tesseract logs an error
    message (though otherwise doesn't fail) if the config file is not the
    last thing on the command line.
---
 .../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 6bf2ab4..f274ce1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -468,8 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
         ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
                 config.getTesseractPath() + getTesseractProg(), input.getPath(),  output.getPath(), "-l",
-                config.getLanguage(), "-psm", config.getPageSegMode(),
-                config.getOutputType().name().toLowerCase(Locale.US)
+                config.getLanguage(), "--psm", config.getPageSegMode()
         ));
         for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
             cmd.add("-c");
@@ -478,7 +477,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         cmd.addAll(Arrays.asList(
                 "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
-                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"
+                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
+                config.getOutputType().name().toLowerCase(Locale.US)
         ));
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.