You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2008/04/07 15:07:17 UTC
svn commit: r645489 - in /jackrabbit/trunk/jackrabbit-text-extractors/src:
main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java
test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
Author: mreutegg
Date: Mon Apr 7 06:07:16 2008
New Revision: 645489
URL: http://svn.apache.org/viewvc?rev=645489&view=rev
Log:
JCR-1523: [PATCH] png, apng, mng text extractor
Added:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java (with props)
jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java (with props)
Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java?rev=645489&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java Mon Apr 7 06:07:16 2008
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.CharArrayReader;
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Text extractor for png/apng/mng images. This class extracts the text content
+ * from tEXt chunks and renders every chunk as a <code>keyword: text</code>
+ * entry; entries are separated by the platform line separator.
+ * <p>
+ * Handles images with the mime types
+ * (<code>image/png, image/apng, image/mng</code>).
+ */
+public class PngTextExtractor extends AbstractTextExtractor {
+
+    /** Signature bytes expected at the start of a png stream. */
+    private static final byte[] PNG_HEADER = {-119, 80, 78, 71, 13, 10, 26, 10};
+
+    /** Signature bytes expected at the start of a mng stream. */
+    private static final byte[] MNG_HEADER = {-119, 77, 78, 71, 13, 10, 26, 10};
+
+    /** Chunk type "IEND" (ASCII); parsing stops when it is encountered. */
+    private static final byte[] IEND_CHUNK = {73, 69, 78, 68};
+
+    /** Chunk type "tEXt" (ASCII); the only chunk type that is extracted. */
+    private static final byte[] TEXT_CHUNK = {116, 69, 88, 116};
+
+    /** Character set used to decode keyword and text of a tEXt chunk. */
+    private static final String TEXT_CHARSET = "ISO-8859-1";
+
+    /** Number of checksum bytes that follow the data of every chunk. */
+    private static final int CRC_LENGTH = 4;
+
+    /** Initial size of the buffer used to read the data of a tEXt chunk. */
+    private static final int INITIAL_BUFFER_SIZE = 8192;
+
+    /** Separator written between two extracted entries. */
+    private static final String SEPARATOR = System.getProperty("line.separator");
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(PngTextExtractor.class);
+
+    /**
+     * Creates a new <code>PngTextExtractor</code> instance.
+     */
+    public PngTextExtractor() {
+        super(new String[]{"image/png", "image/apng", "image/mng"});
+    }
+
+    /**
+     * Returns a reader for the text content of the given png image.
+     * <p>
+     * An empty reader is returned if the stream does not start with a png or
+     * mng signature or if reading the stream fails with an
+     * <code>IOException</code>. If the stream ends prematurely or contains a
+     * chunk with an invalid length, parsing stops and the text collected up
+     * to that point is returned. tEXt chunks that lack the null separator
+     * between keyword and text are skipped.
+     *
+     * @param stream png image; always closed by this method
+     * @param type ignored
+     * @param encoding ignored
+     * @return reader for the text content of the given png image, or an empty
+     *         reader if the image could not be parsed
+     * @throws IOException if the png image stream can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding)
+            throws IOException {
+        try {
+            byte[] header = new byte[8];
+            if (!readFully(stream, header)
+                    || (!Arrays.equals(PNG_HEADER, header)
+                        && !Arrays.equals(MNG_HEADER, header))) {
+                return new StringReader("");
+            }
+
+            CharArrayWriter writer = new CharArrayWriter();
+            byte[] length = new byte[4];
+            byte[] chunkType = new byte[4];
+            // Stays empty until the first entry has been written, so that the
+            // separator only ever appears between two entries.
+            String sep = "";
+
+            // Every chunk is laid out as: length (4), type (4), data, crc (4).
+            // The loop also ends when the stream runs out of data; otherwise a
+            // truncated image would never produce the terminating IEND chunk.
+            while (readFully(stream, length)
+                    && readFully(stream, chunkType)
+                    && !Arrays.equals(chunkType, IEND_CHUNK)) {
+                long dataLength = calcLen(length);
+                if (dataLength > Integer.MAX_VALUE) {
+                    // Not addressable as an array and not a valid chunk
+                    // length; the rest of the stream cannot be trusted.
+                    logger.debug("Invalid png chunk length, stopping text extraction");
+                    break;
+                }
+
+                if (Arrays.equals(chunkType, TEXT_CHUNK)) {
+                    byte[] txtBytes = readChunkData(stream, (int) dataLength);
+                    if (txtBytes == null) {
+                        break; // stream ended inside the chunk
+                    }
+                    int nullPos = findOffset(txtBytes, (byte) 0);
+                    if (nullPos >= 0) {
+                        String key = new String(txtBytes, 0, nullPos, TEXT_CHARSET);
+                        String value = new String(txtBytes, nullPos + 1,
+                                txtBytes.length - (nullPos + 1), TEXT_CHARSET);
+                        writer.write(sep);
+                        writer.write(key);
+                        writer.write(": ");
+                        writer.write(value);
+                        sep = SEPARATOR;
+                    } else {
+                        logger.debug("Skipping tEXt chunk without null separator");
+                    }
+                } else if (!skipFully(stream, dataLength)) {
+                    break; // stream ended inside the chunk
+                }
+
+                if (!skipFully(stream, CRC_LENGTH)) {
+                    break; // stream ended inside the checksum
+                }
+            }
+            return new CharArrayReader(writer.toCharArray());
+        } catch (IOException e) {
+            logger.warn("Failed to extract png text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Fills the given buffer completely from the stream. A single call to
+     * <code>InputStream.read(byte[])</code> may return fewer bytes than
+     * requested, hence the loop.
+     *
+     * @param in stream to read from
+     * @param buffer buffer to fill
+     * @return <code>true</code> if the buffer was filled, <code>false</code>
+     *         if the end of the stream was reached first
+     * @throws IOException if reading from the stream fails
+     */
+    private static boolean readFully(InputStream in, byte[] buffer)
+            throws IOException {
+        int filled = 0;
+        while (filled < buffer.length) {
+            int count = in.read(buffer, filled, buffer.length - filled);
+            if (count < 0) {
+                return false;
+            }
+            filled += count;
+        }
+        return true;
+    }
+
+    /**
+     * Skips exactly the given number of bytes.
+     *
+     * @param in stream to skip in
+     * @param count number of bytes to skip
+     * @return <code>true</code> if all bytes were skipped, <code>false</code>
+     *         if the end of the stream was reached first
+     * @throws IOException if reading from the stream fails
+     */
+    private static boolean skipFully(InputStream in, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = in.skip(remaining);
+            if (skipped <= 0) {
+                // skip() is allowed to skip nothing without being at the end
+                // of the stream; a one byte read tells the two cases apart.
+                if (in.read() < 0) {
+                    return false;
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+        return true;
+    }
+
+    /**
+     * Reads the data of a chunk. The buffer grows while data actually
+     * arrives, so that a bogus length field in a small file does not trigger
+     * a huge up-front allocation.
+     *
+     * @param in stream to read from
+     * @param length declared length of the chunk data, must not be negative
+     * @return the chunk data, or <code>null</code> if the stream ended before
+     *         <code>length</code> bytes were read
+     * @throws IOException if reading from the stream fails
+     */
+    private static byte[] readChunkData(InputStream in, int length)
+            throws IOException {
+        byte[] data = new byte[Math.min(length, INITIAL_BUFFER_SIZE)];
+        int filled = 0;
+        while (filled < length) {
+            if (filled == data.length) {
+                int newSize = (int) Math.min((long) length, data.length * 2L);
+                byte[] grown = new byte[newSize];
+                System.arraycopy(data, 0, grown, 0, filled);
+                data = grown;
+            }
+            int count = in.read(data, filled, data.length - filled);
+            if (count < 0) {
+                return null;
+            }
+            filled += count;
+        }
+        return data;
+    }
+
+    /**
+     * Decodes the four byte big-endian length field of a chunk as an
+     * unsigned value.
+     *
+     * @param length the four length bytes, most significant byte first
+     * @return the decoded length, never negative
+     */
+    private static long calcLen(byte[] length) {
+        return ((long) (length[0] & 0xFF) << 24)
+                | ((length[1] & 0xFF) << 16)
+                | ((length[2] & 0xFF) << 8)
+                | (length[3] & 0xFF);
+    }
+
+    /**
+     * Returns the index of the first occurrence of a byte value.
+     *
+     * @param data array to search
+     * @param val value to look for
+     * @return index of the first occurrence of <code>val</code> in
+     *         <code>data</code>, or <code>-1</code> if there is none
+     */
+    int findOffset(byte[] data, byte val) {
+        for (int i = 0; i < data.length; i++) {
+            if (data[i] == val) {
+                return i;
+            }
+        }
+
+        return -1;
+    }
+}
Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java?rev=645489&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java Mon Apr 7 06:07:16 2008
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+public class PngTextExtractorTest extends TestCase {
+
+    /**
+     * The extractor under test; recreated for every test method.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Sets up a fresh <code>PngTextExtractor</code>.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new PngTextExtractor();
+    }
+
+    /**
+     * Checks that the extractor supports exactly <code>image/png</code>,
+     * <code>image/apng</code> and <code>image/mng</code>.
+     */
+    public void testContentTypes() {
+        String[] expected = {"image/png", "image/apng", "image/mng"};
+        Set supported = new HashSet(Arrays.asList(extractor.getContentTypes()));
+
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("PngTextExtractor does not support " + expected[i],
+                    supported.contains(expected[i]));
+        }
+        assertEquals("PngTextExtractor supports unknown content types",
+                expected.length, supported.size());
+    }
+
+    /**
+     * Checks that an empty stream yields an empty text instead of an
+     * exception.
+     */
+    public void testEmptyStream() {
+        ByteArrayInputStream empty = new ByteArrayInputStream(new byte[0]);
+        try {
+            Reader text = extractor.extractText(empty, "image/png", null);
+            assertEquals("", ExtractorHelper.read(text));
+        } catch (IOException e) {
+            fail("PngTextExtractor does not handle empty streams");
+        }
+    }
+
+    /**
+     * Checks that the content of the tEXt chunk embedded in a small png
+     * image is extracted as <code>keyword: text</code>.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        // png image whose tEXt chunk holds the keyword "Comment" and the
+        // pangram asserted below
+        byte[] png = {-119, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82,
+                      0, 0, 0, 1, 0, 0, 0, 1, 8, 6, 0, 0, 0, 31, 21, -60,
+                      -119, 0, 0, 0, 6, 98, 75, 71, 68, 0, -1, 0, -1, 0, -1, -96,
+                      -67, -89, -109, 0, 0, 0, 9, 112, 72, 89, 115, 0, 0, 11, 19, 0,
+                      0, 11, 19, 1, 0, -102, -100, 24, 0, 0, 0, 7, 116, 73, 77, 69,
+                      7, -40, 4, 6, 5, 59, 15, 72, -108, -3, -68, 0, 0, 0, 52, 116,
+                      69, 88, 116, 67, 111, 109, 109, 101, 110, 116, 0, 84, 104, 101, 32, 113,
+                      117, 105, 99, 107, 32, 98, 114, 111, 119, 110, 32, 102, 111, 120, 32, 106,
+                      117, 109, 112, 115, 32, 111, 118, 101, 114, 32, 116, 104, 101, 32, 108, 97,
+                      122, 121, 32, 100, 111, 103, 46, 55, 79, -28, -66, 0, 0, 0, 13, 73,
+                      68, 65, 84, 8, -41, 99, -8, -33, -64, -16, 31, 0, 6, -128, 2, 127,
+                      -21, 73, 116, -101, 0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126};
+
+        ByteArrayInputStream image = new ByteArrayInputStream(png);
+        Reader text = extractor.extractText(image, "image/png", null);
+        String extracted = ExtractorHelper.read(text);
+        assertEquals("Comment: The quick brown fox jumps over the lazy dog.", extracted);
+    }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
------------------------------------------------------------------------------
svn:eol-style = native