You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/02/26 04:46:18 UTC

svn commit: r1662350 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/strings/Latin1StringsParser.java test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java

Author: mattmann
Date: Thu Feb 26 03:46:17 2015
New Revision: 1662350

URL: http://svn.apache.org/r1662350
Log:
Fix for TIKA-1483 Create a Latin1 charset raw string parser contributed by Lius Filipe Nassif.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java?rev=1662350&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java Thu Feb 26 03:46:17 2015
@@ -0,0 +1,322 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser to extract printable Latin1 strings from arbitrary files with pure
+ * java. Useful for binary or unknown files, for files without a specific parser
+ * and for corrupted ones causing a TikaException as a fallback parser.
+ * 
+ * Currently the parser does a best effort to extract Latin1 strings, used by
+ * Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 charsets
+ * within the same file.
+ * 
+ * The implementation is optimized for fast parsing with only one pass.
+ */
+public class Latin1StringsParser extends AbstractParser {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * The set of supported types
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+    /**
+     * The valid ISO-8859-1 character map.
+     */
+    private static final boolean[] isChar = getCharMap();
+
+    /**
+     * The size of the internal buffers.
+     */
+    private static int BUF_SIZE = 64 * 1024;
+
+    /**
+     * The minimum size of a character sequence to be extracted.
+     */
+    private int minSize = 4;
+
+    /**
+     * The output buffer.
+     */
+    private byte[] output = new byte[BUF_SIZE];
+
+    /**
+     * The input buffer.
+     */
+    private byte[] input = new byte[BUF_SIZE];
+
+    /**
+     * The temporary position into the output buffer.
+     */
+    private int tmpPos = 0;
+
+    /**
+     * The current position into the output buffer.
+     */
+    private int outPos = 0;
+
+    /**
+     * The number of bytes into the input buffer.
+     */
+    private int inSize = 0;
+
+    /**
+     * The position into the input buffer.
+     */
+    private int inPos = 0;
+
+    /**
+     * The output content handler.
+     */
+    private XHTMLContentHandler xhtml;
+
+    /**
+     * Returns the minimum size of a character sequence to be extracted.
+     * 
+     * @return the minimum size of a character sequence
+     */
+    public int getMinSize() {
+        return minSize;
+    }
+
+    /**
+     * Sets the minimum size of a character sequence to be extracted.
+     * 
+     * @param minSize
+     *            the minimum size of a character sequence
+     */
+    public void setMinSize(int minSize) {
+        this.minSize = minSize;
+    }
+
+    /**
+     * Populates the valid ISO-8859-1 character map.
+     * 
+     * @return the valid ISO-8859-1 character map.
+     */
+    private static boolean[] getCharMap() {
+
+        boolean[] isChar = new boolean[256];
+        for (int c = Byte.MIN_VALUE; c <= Byte.MAX_VALUE; c++)
+            if ((c >= 0x20 && c <= 0x7E)
+                    || (c >= (byte) 0xC0 && c <= (byte) 0xFE) || c == 0x0A
+                    || c == 0x0D || c == 0x09) {
+                isChar[c & 0xFF] = true;
+            }
+        return isChar;
+
+    }
+
+    /**
+     * Returns the set of supported types.
+     * 
+     * @return the set of supported types
+     */
+    private static Set<MediaType> getTypes() {
+        HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+        supportedTypes.add(MediaType.OCTET_STREAM);
+        return supportedTypes;
+    }
+
+    /**
+     * Tests if the byte is a ISO-8859-1 char.
+     * 
+     * @param c
+     *            the byte to test.
+     * 
+     * @return if the byte is a char.
+     */
+    private static final boolean isChar(byte c) {
+        return isChar[c & 0xFF];
+    }
+
+    /**
+     * Flushes the internal output buffer to the content handler.
+     * 
+     * @throws UnsupportedEncodingException
+     * @throws SAXException
+     */
+    private void flushBuffer() throws UnsupportedEncodingException,
+            SAXException {
+        if (tmpPos - outPos >= minSize)
+            outPos = tmpPos - minSize;
+
+        xhtml.characters(new String(output, 0, outPos, "windows-1252"));
+
+        for (int k = 0; k < tmpPos - outPos; k++)
+            output[k] = output[outPos + k];
+        tmpPos = tmpPos - outPos;
+        outPos = 0;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+     *      org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+     *      org.apache.tika.parser.ParseContext)
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException {
+        /*
+         * Creates a new instance because the object is not immutable.
+         */
+        new Latin1StringsParser().doParse(stream, handler, metadata, context);
+    }
+
+    /**
+     * Does a best effort to extract Latin1 strings encoded with ISO-8859-1,
+     * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the
+     * temporary buffer position is incremented. When an invalid char is read,
+     * the difference of the temporary and current buffer position is checked.
+     * If it is greater than the minimum string size, the current buffer
+     * position is updated to the temp position. If it is not, the temp position
+     * is reseted to the current position.
+     * 
+     * @param stream
+     *            the input stream.
+     * @param handler
+     *            the output content handler
+     * @param metadata
+     *            the metadata of the file
+     * @param context
+     *            the parsing context
+     * @throws IOException
+     *             if an io error occurs
+     * @throws SAXException
+     *             if a sax error occurs
+     */
+    private void doParse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException {
+
+        tmpPos = 0;
+        outPos = 0;
+
+        xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        int i = 0;
+        do {
+            inSize = 0;
+            while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) {
+                inSize += i;
+            }
+            inPos = 0;
+            while (inPos < inSize) {
+                byte c = input[inPos++];
+                boolean utf8 = false;
+                /*
+                 * Test for a possible UTF8 encoded char
+                 */
+                if (c == (byte) 0xC3) {
+                    byte c_ = inPos < inSize ? input[inPos++] : (byte) stream
+                            .read();
+                    /*
+                     * Test if the next byte is in the valid UTF8 range
+                     */
+                    if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) {
+                        utf8 = true;
+                        output[tmpPos++] = (byte) (c_ + 0x40);
+                    } else {
+                        output[tmpPos++] = c;
+                        c = c_;
+                    }
+                    if (tmpPos == BUF_SIZE)
+                        flushBuffer();
+
+                    /*
+                     * Test for a possible UTF8 encoded char
+                     */
+                } else if (c == (byte) 0xC2) {
+                    byte c_ = inPos < inSize ? input[inPos++] : (byte) stream
+                            .read();
+                    /*
+                     * Test if the next byte is in the valid UTF8 range
+                     */
+                    if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) {
+                        utf8 = true;
+                        output[tmpPos++] = c_;
+                    } else {
+                        output[tmpPos++] = c;
+                        c = c_;
+                    }
+                    if (tmpPos == BUF_SIZE)
+                        flushBuffer();
+                }
+                if (!utf8)
+                    /*
+                     * Test if the byte is a valid char.
+                     */
+                    if (isChar(c)) {
+                        output[tmpPos++] = c;
+                        if (tmpPos == BUF_SIZE)
+                            flushBuffer();
+                    } else {
+                        /*
+                         * Test if the byte is an invalid char, marking a string
+                         * end. If it is a zero, test 2 positions before or
+                         * ahead for a valid char, meaning it marks the
+                         * transition between ISO-8859-1 and UTF16 sequences.
+                         */
+                        if (c != 0
+                                || (inPos >= 3 && isChar(input[inPos - 3]))
+                                || (inPos + 1 < inSize && isChar(input[inPos + 1]))) {
+
+                            if (tmpPos - outPos >= minSize) {
+                                output[tmpPos++] = 0x0A;
+                                outPos = tmpPos;
+
+                                if (tmpPos == BUF_SIZE)
+                                    flushBuffer();
+                            } else
+                                tmpPos = outPos;
+
+                        }
+                    }
+            }
+        } while (i != -1 && !Thread.currentThread().isInterrupted());
+
+        if (tmpPos - outPos >= minSize) {
+            output[tmpPos++] = 0x0A;
+            outPos = tmpPos;
+        }
+        xhtml.characters(new String(output, 0, outPos, "windows-1252"));
+
+        xhtml.endDocument();
+
+    }
+
+}

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java?rev=1662350&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java Thu Feb 26 03:46:17 2015
@@ -0,0 +1,70 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class Latin1StringsParserTest {
+
+    @Test
+    public void testParse() throws Exception {
+
+        String testStr = "These are Latin1 accented scripts: \u00C2 \u00C3 \u00C9 \u00DC \u00E2 \u00E3 \u00E9 \u00FC";
+        String smallStr = "ab";
+
+        byte[] iso8859Bytes = testStr.getBytes("ISO-8859-1");
+        byte[] utf8Bytes = testStr.getBytes("UTF-8");
+        byte[] utf16Bytes = testStr.getBytes("UTF-16");
+        byte[] zeros = new byte[10];
+        byte[] smallString = smallStr.getBytes("ISO-8859-1");
+        byte[] trashBytes = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        baos.write(iso8859Bytes);
+        baos.write(zeros);
+        baos.write(utf8Bytes);
+        baos.write(trashBytes);
+        baos.write(utf16Bytes);
+        baos.write(zeros);
+        baos.write(smallString);
+
+        Parser parser = new Latin1StringsParser();
+        ContentHandler handler = new BodyContentHandler();
+
+        InputStream stream = new ByteArrayInputStream(baos.toByteArray());
+
+        try {
+            parser.parse(stream, handler, new Metadata(), new ParseContext());
+        } finally {
+            stream.close();
+        }
+
+        String result = handler.toString();
+        String expected = testStr + "\n" + testStr + "\n" + testStr + "\n";
+
+        // Test if result contains only the test string appended 3 times
+        assertTrue(result.equals(expected));
+    }
+}