You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2008/04/07 15:07:17 UTC

svn commit: r645489 - in /jackrabbit/trunk/jackrabbit-text-extractors/src: main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java

Author: mreutegg
Date: Mon Apr  7 06:07:16 2008
New Revision: 645489

URL: http://svn.apache.org/viewvc?rev=645489&view=rev
Log:
JCR-1523: [PATCH] png, apng, mng text extractor

Added:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java   (with props)

Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java?rev=645489&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java Mon Apr  7 06:07:16 2008
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.CharArrayReader;
+import java.io.CharArrayWriter;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Text extractor for png/apng/mng images. This class extracts the text content
+ * from tEXt chunks and returns one "keyword: text" entry per chunk.
+ * <p>
+ * Can handle images with the mime types
+ * (<code>image/png, image/apng, image/mng</code>).
+ */
+public class PngTextExtractor extends AbstractTextExtractor {
+
+    /** Signature of png (and apng) streams. */
+    private static final byte[] PNG_HEADER = {-119, 80, 78, 71, 13, 10, 26, 10};
+
+    /** Signature of mng streams. */
+    private static final byte[] MNG_HEADER = {-119, 77, 78, 71, 13, 10, 26, 10};
+
+    /** Type of the chunk that ends a png stream ("IEND"). */
+    private static final byte[] IEND_CHUNK = {73, 69, 78, 68};
+
+    /** Type of the chunk that ends an mng stream ("MEND"). */
+    private static final byte[] MEND_CHUNK = {77, 69, 78, 68};
+
+    /** Type of the textual data chunk ("tEXt"). */
+    private static final byte[] TEXT_CHUNK = {116, 69, 88, 116};
+
+    /** Upper bound for a single read buffer allocation. */
+    private static final int BUFFER_SIZE = 8192;
+
+    /** Separator written between the extracted text entries. */
+    private static final String SEPARATOR = System.getProperty("line.separator");
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+            LoggerFactory.getLogger(PngTextExtractor.class);
+
+    /**
+     * Creates a new <code>PngTextExtractor</code> instance.
+     */
+    public PngTextExtractor() {
+        super(new String[]{"image/png", "image/apng", "image/mng"});
+    }
+
+    /**
+     * Returns a reader for the text content of the given png image. Every tEXt
+     * chunk contributes one "keyword: text" entry; entries are separated by
+     * the platform line separator. Returns an empty reader if the stream does
+     * not start with a png or mng signature, or if it is truncated or
+     * otherwise malformed.
+     *
+     * @param stream   png image
+     * @param type     ignored
+     * @param encoding ignored
+     * @return reader for the text content of the given png image, or an empty
+     *         reader if the image could not be parsed
+     * @throws IOException if the png image stream can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding)
+            throws IOException {
+        try {
+            byte[] header = new byte[8];
+            boolean png = readFully(stream, header)
+                    && Arrays.equals(PNG_HEADER, header);
+            if (!png && !Arrays.equals(MNG_HEADER, header)) {
+                return new StringReader("");
+            }
+            // An mng stream may embed complete png streams (including their
+            // IEND chunks), so only MEND terminates it.
+            byte[] endChunk = png ? IEND_CHUNK : MEND_CHUNK;
+
+            CharArrayWriter writer = new CharArrayWriter();
+            byte[] length = new byte[4];
+            byte[] chunkType = new byte[4];
+            String sep = "";
+            // A stream that ends cleanly between two chunks simply ends the
+            // extraction; an end of stream anywhere else is a parse failure.
+            while (readFully(stream, length)) {
+                if (!readFully(stream, chunkType)) {
+                    throw new EOFException("Truncated png chunk header");
+                }
+                if (Arrays.equals(chunkType, endChunk)) {
+                    break;
+                }
+                int len = calcLen(length);
+                if (len < 0) {
+                    // the png specification limits lengths to 2^31-1
+                    throw new IOException("Invalid png chunk length");
+                }
+                if (Arrays.equals(chunkType, TEXT_CHUNK)) {
+                    byte[] txtBytes = readData(stream, len);
+                    int nullPos = findOffset(txtBytes, (byte) 0);
+                    // ignore malformed chunks without keyword terminator
+                    if (nullPos >= 0) {
+                        String key = new String(txtBytes, 0, nullPos, "ISO-8859-1");
+                        String value = new String(txtBytes, nullPos + 1,
+                                txtBytes.length - (nullPos + 1), "ISO-8859-1");
+                        // separator goes between entries, never after the last
+                        writer.write(sep);
+                        writer.write(key);
+                        writer.write(": ");
+                        writer.write(value);
+                        sep = SEPARATOR;
+                    }
+                } else {
+                    skipFully(stream, len);
+                }
+                skipFully(stream, 4); // CRC of the chunk, not verified
+            }
+            return new CharArrayReader(writer.toCharArray());
+        } catch (IOException e) {
+            logger.warn("Failed to extract png text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Fills the given buffer completely from the stream.
+     *
+     * @param in  stream to read from
+     * @param buf buffer to fill
+     * @return <code>true</code> if the buffer was filled, <code>false</code>
+     *         if the stream was already at its end
+     * @throws IOException if the stream ends after some but not all bytes
+     *                     were read, or if reading fails
+     */
+    private static boolean readFully(InputStream in, byte[] buf)
+            throws IOException {
+        int off = 0;
+        while (off < buf.length) {
+            int n = in.read(buf, off, buf.length - off);
+            if (n < 0) {
+                if (off == 0) {
+                    return false;
+                }
+                throw new EOFException("Unexpected end of png stream");
+            }
+            off += n;
+        }
+        return true;
+    }
+
+    /**
+     * Reads exactly <code>len</code> bytes of chunk data. The data is read in
+     * pieces of limited size so that a bogus length field in a short stream
+     * fails with an end of stream error instead of a huge allocation.
+     *
+     * @param in  stream to read from
+     * @param len number of bytes to read, not negative
+     * @return the bytes read
+     * @throws IOException if the stream ends early or reading fails
+     */
+    private static byte[] readData(InputStream in, int len) throws IOException {
+        byte[] buf = new byte[Math.min(len, BUFFER_SIZE)];
+        ByteArrayOutputStream data = new ByteArrayOutputStream(buf.length);
+        int remaining = len;
+        while (remaining > 0) {
+            int n = in.read(buf, 0, Math.min(buf.length, remaining));
+            if (n < 0) {
+                throw new EOFException("Unexpected end of png stream");
+            }
+            data.write(buf, 0, n);
+            remaining -= n;
+        }
+        return data.toByteArray();
+    }
+
+    /**
+     * Skips exactly <code>count</code> bytes of the stream.
+     *
+     * @param in    stream to skip in
+     * @param count number of bytes to skip, not negative
+     * @throws IOException if the stream ends early or skipping fails
+     */
+    private static void skipFully(InputStream in, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = in.skip(remaining);
+            if (skipped <= 0) {
+                // skip() may return 0 without being at the end of the
+                // stream; a single byte read tells the two cases apart
+                if (in.read() < 0) {
+                    throw new EOFException("Unexpected end of png stream");
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+    }
+
+    /**
+     * Decodes the 4 byte big-endian length field of a chunk.
+     *
+     * @param length the 4 bytes of the length field
+     * @return the decoded length; negative if the most significant bit is set
+     */
+    private int calcLen(byte[] length) {
+        int len = 0x00FF & length[0];
+        len <<= 8;
+        len |= 0x00FF & length[1];
+        len <<= 8;
+        len |= 0x00FF & length[2];
+        len <<= 8;
+        len |= 0x00FF & length[3];
+        return len;
+    }
+
+    /**
+     * Returns the index of the first occurrence of a byte value.
+     *
+     * @param data bytes to search
+     * @param val  value to look for
+     * @return index of the first occurrence of <code>val</code> in
+     *         <code>data</code>, or <code>-1</code> if there is none
+     */
+    int findOffset(byte[] data, byte val) {
+        for (int i = 0; i < data.length; i++) {
+            if (data[i] == val) {
+                return i;
+            }
+        }
+
+        return -1;
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java?rev=645489&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java Mon Apr  7 06:07:16 2008
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for <code>PngTextExtractor</code>.
+ */
+public class PngTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new PngTextExtractor();
+    }
+
+    /**
+     * Tests that the extractor supports <code>image/png</code>,
+     * <code>image/apng</code> and <code>image/mng</code>.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet();
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+        assertTrue("PngTextExtractor does not support image/png",
+                types.contains("image/png"));
+        assertTrue("PngTextExtractor does not support image/apng",
+                types.contains("image/apng"));
+        assertTrue("PngTextExtractor does not support image/mng",
+                types.contains("image/mng"));
+        assertEquals("PngTextExtractor supports unknown content types",
+                3, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testEmptyStream() throws IOException {
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(new byte[0]), "image/png", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor returns no text for a stream that does not
+     * start with a png or mng signature.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNoPngStream() throws IOException {
+        byte[] gif = {71, 73, 70, 56, 57, 97, 1, 0, 1, 0, 0, 0, 0, 59};
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(gif), "image/png", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        byte[] png = {-119, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82,
+                      0, 0, 0, 1, 0, 0, 0, 1, 8, 6, 0, 0, 0, 31, 21, -60,
+                      -119, 0, 0, 0, 6, 98, 75, 71, 68, 0, -1, 0, -1, 0, -1, -96,
+                      -67, -89, -109, 0, 0, 0, 9, 112, 72, 89, 115, 0, 0, 11, 19, 0,
+                      0, 11, 19, 1, 0, -102, -100, 24, 0, 0, 0, 7, 116, 73, 77, 69,
+                      7, -40, 4, 6, 5, 59, 15, 72, -108, -3, -68, 0, 0, 0, 52, 116,
+                      69, 88, 116, 67, 111, 109, 109, 101, 110, 116, 0, 84, 104, 101, 32, 113,
+                      117, 105, 99, 107, 32, 98, 114, 111, 119, 110, 32, 102, 111, 120, 32, 106,
+                      117, 109, 112, 115, 32, 111, 118, 101, 114, 32, 116, 104, 101, 32, 108, 97,
+                      122, 121, 32, 100, 111, 103, 46, 55, 79, -28, -66, 0, 0, 0, 13, 73,
+                      68, 65, 84, 8, -41, 99, -8, -33, -64, -16, 31, 0, 6, -128, 2, 127,
+                      -21, 73, 116, -101, 0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126};
+
+        Reader reader = extractor.extractText(new ByteArrayInputStream(png), "image/png", null);
+        assertEquals("Comment: The quick brown fox jumps over the lazy dog.", ExtractorHelper.read(reader));
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native