You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:29 UTC
[23/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
index 101b26b..51dc5a5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
@@ -1,327 +1,327 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.util.concurrent.CancellationException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
-import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-public class ChmLzxState implements Cloneable {
- /* Class' members */
- private int window; /* the actual decoding window */
- private long window_size; /* window size (32Kb through 2Mb) */
- private int window_position; /* current offset within the window */
- private int main_tree_elements; /* number of main tree elements */
- private LzxState hadStarted; /* have we started decoding at all yet? */
- private int block_type; /* type of this block */
- private int block_length; /* uncompressed length of this block */
- private int block_remaining; /* uncompressed bytes still left to decode */
- private int frames_read; /* the number of CFDATA blocks processed */
- private int intel_file_size; /* magic header value used for transform */
- private long intel_current_possition; /* current offset in transform space */
- private IntelState intel_state; /* have we seen any translatable data yet? */
- private long R0; /* for the LRU offset system */
- private long R1; /* for the LRU offset system */
- private long R2; /* for the LRU offset system */
-
- // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
- protected short[] mainTreeLengtsTable;
- protected short[] mainTreeTable;
-
- protected short[] lengthTreeTable;
- protected short[] lengthTreeLengtsTable;
-
- protected short[] alignedLenTable;
- protected short[] alignedTreeTable;
-
- @Override
- public ChmLzxState clone() {
- try {
- ChmLzxState clone = (ChmLzxState)super.clone();
- clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
- clone.mainTreeTable = arrayClone(mainTreeTable);
- clone.lengthTreeTable = arrayClone(lengthTreeTable);
- clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
- clone.alignedLenTable = arrayClone(alignedLenTable);
- clone.alignedTreeTable = arrayClone(alignedTreeTable);
- return clone;
- } catch (CloneNotSupportedException ex) {
- return null;
- }
- }
-
- protected short[] getMainTreeTable() {
- return mainTreeTable;
- }
-
- protected short[] getAlignedTreeTable() {
- return alignedTreeTable;
- }
-
- protected void setAlignedTreeTable(short[] alignedTreeTable) {
- this.alignedTreeTable = alignedTreeTable;
- }
-
- protected short[] getLengthTreeTable() throws TikaException {
- if (lengthTreeTable != null)
- return this.lengthTreeTable;
- else
- throw new ChmParsingException("lengthTreeTable is null");
- }
-
- protected void setLengthTreeTable(short[] lengthTreeTable) {
- this.lengthTreeTable = lengthTreeTable;
- }
-
- protected void setMainTreeTable(short[] mainTreeTable) {
- this.mainTreeTable = mainTreeTable;
- }
-
- protected short[] getAlignedLenTable() {
- return this.alignedLenTable;
- }
-
- protected void setAlignedLenTable(short[] alignedLenTable) {
- this.alignedLenTable = alignedLenTable;
- }
-
- /**
- * It suits for informative outlook
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("actual decoding window:=" + getWindow()
- + System.getProperty("line.separator"));
- sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
- + System.getProperty("line.separator"));
- sb.append("current offset within the window:=" + getWindowPosition()
- + System.getProperty("line.separator"));
- sb.append("number of main tree elements:=" + getMainTreeElements()
- + System.getProperty("line.separator"));
- sb.append("have we started decoding at all yet?:=" + getHadStarted()
- + System.getProperty("line.separator"));
- sb.append("type of this block:=" + getBlockType()
- + System.getProperty("line.separator"));
- sb.append("uncompressed length of this block:=" + getBlockLength()
- + System.getProperty("line.separator"));
- sb.append("uncompressed bytes still left to decode:="
- + getBlockRemaining() + System.getProperty("line.separator"));
- sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
- + System.getProperty("line.separator"));
- sb.append("magic header value used for transform:="
- + getIntelFileSize() + System.getProperty("line.separator"));
- sb.append("current offset in transform space:="
- + getIntelCurrentPossition()
- + System.getProperty("line.separator"));
- sb.append("have we seen any translatable data yet?:=" + getIntelState()
- + System.getProperty("line.separator"));
- sb.append("R0 for the LRU offset system:=" + getR0()
- + System.getProperty("line.separator"));
- sb.append("R1 for the LRU offset system:=" + getR1()
- + System.getProperty("line.separator"));
- sb.append("R2 for the LRU offset system:=" + getR2()
- + System.getProperty("line.separator"));
- sb.append("main tree length:=" + getMainTreeLengtsTable().length
- + System.getProperty("line.separator"));
- sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- public ChmLzxState(int window) throws TikaException {
- if (window >= 0) {
- int position_slots;
- int win = ChmCommons.getWindowSize(window);
- setWindowSize(1 << win);
- /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
- if (win < 15 || win > 21)
- throw new ChmParsingException("window less than 15 or window greater than 21");
-
- /* Calculates required position slots */
- if (win == 20)
- position_slots = 42;
- else if (win == 21)
- position_slots = 50;
- else
- position_slots = win << 1;
- //TODO: position_slots is not used ?
- setR0(1);
- setR1(1);
- setR2(1);
- setMainTreeElements(512);
- setHadStarted(LzxState.NOT_STARTED_DECODING);
- setFramesRead(0);
- setBlockRemaining(0);
- setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
- setIntelCurrentPossition(0);
- setIntelState(IntelState.NOT_STARTED);
- setWindowPosition(0);
- setMainTreeLengtsTable(new short[getMainTreeElements()]);
- setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
- } else
- throw new CancellationException(
- "window size should be more than zero");
- }
-
- protected void setWindow(int window) {
- this.window = window;
- }
-
- protected int getWindow() {
- return window;
- }
-
- protected void setWindowSize(long window_size) {
- this.window_size = window_size;
- }
-
- protected long getWindowSize() {
- return window_size;
- }
-
- protected void setWindowPosition(int window_position) {
- this.window_position = window_position;
- }
-
- protected int getWindowPosition() {
- return window_position;
- }
-
- protected void setMainTreeElements(int main_tree_elements) {
- this.main_tree_elements = main_tree_elements;
- }
-
- protected int getMainTreeElements() {
- return main_tree_elements;
- }
-
- protected void setHadStarted(LzxState hadStarted) {
- this.hadStarted = hadStarted;
- }
-
- protected LzxState getHadStarted() {
- return hadStarted;
- }
-
- protected void setBlockType(int block_type) {
- this.block_type = block_type;
- }
-
- public int getBlockType() {
- return block_type;
- }
-
- protected void setBlockLength(int block_length) {
- this.block_length = block_length;
- }
-
- protected int getBlockLength() {
- return block_length;
- }
-
- protected void setBlockRemaining(int block_remaining) {
- this.block_remaining = block_remaining;
- }
-
- protected int getBlockRemaining() {
- return block_remaining;
- }
-
- protected void setFramesRead(int frames_read) {
- this.frames_read = frames_read;
- }
-
- protected void increaseFramesRead() {
- this.frames_read = getFramesRead() + 1;
- }
-
- protected int getFramesRead() {
- return frames_read;
- }
-
- protected void setIntelFileSize(int intel_file_size) {
- this.intel_file_size = intel_file_size;
- }
-
- protected int getIntelFileSize() {
- return intel_file_size;
- }
-
- protected void setIntelCurrentPossition(long intel_current_possition) {
- this.intel_current_possition = intel_current_possition;
- }
-
- protected long getIntelCurrentPossition() {
- return intel_current_possition;
- }
-
- protected void setIntelState(IntelState intel_state) {
- this.intel_state = intel_state;
- }
-
- protected IntelState getIntelState() {
- return intel_state;
- }
-
- protected void setR0(long r0) {
- R0 = r0;
- }
-
- protected long getR0() {
- return R0;
- }
-
- protected void setR1(long r1) {
- R1 = r1;
- }
-
- protected long getR1() {
- return R1;
- }
-
- protected void setR2(long r2) {
- R2 = r2;
- }
-
- protected long getR2() {
- return R2;
- }
-
- public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
- this.mainTreeLengtsTable = mainTreeLengtsTable;
- }
-
- public short[] getMainTreeLengtsTable() {
- return mainTreeLengtsTable;
- }
-
- public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
- this.lengthTreeLengtsTable = lengthTreeLengtsTable;
- }
-
- public short[] getLengthTreeLengtsTable() {
- return lengthTreeLengtsTable;
- }
-
- private static short[] arrayClone(short[] a) {
- return a==null ? null : (short[]) a.clone();
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+ /* Class' members */
+ private int window; /* the actual decoding window */
+ private long window_size; /* window size (32Kb through 2Mb) */
+ private int window_position; /* current offset within the window */
+ private int main_tree_elements; /* number of main tree elements */
+ private LzxState hadStarted; /* have we started decoding at all yet? */
+ private int block_type; /* type of this block */
+ private int block_length; /* uncompressed length of this block */
+ private int block_remaining; /* uncompressed bytes still left to decode */
+ private int frames_read; /* the number of CFDATA blocks processed */
+ private int intel_file_size; /* magic header value used for transform */
+ private long intel_current_possition; /* current offset in transform space */
+ private IntelState intel_state; /* have we seen any translatable data yet? */
+ private long R0; /* for the LRU offset system */
+ private long R1; /* for the LRU offset system */
+ private long R2; /* for the LRU offset system */
+
+ // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+ protected short[] mainTreeLengtsTable;
+ protected short[] mainTreeTable;
+
+ protected short[] lengthTreeTable;
+ protected short[] lengthTreeLengtsTable;
+
+ protected short[] alignedLenTable;
+ protected short[] alignedTreeTable;
+
+ @Override
+ public ChmLzxState clone() {
+ try {
+ ChmLzxState clone = (ChmLzxState)super.clone();
+ clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+ clone.mainTreeTable = arrayClone(mainTreeTable);
+ clone.lengthTreeTable = arrayClone(lengthTreeTable);
+ clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+ clone.alignedLenTable = arrayClone(alignedLenTable);
+ clone.alignedTreeTable = arrayClone(alignedTreeTable);
+ return clone;
+ } catch (CloneNotSupportedException ex) {
+ return null;
+ }
+ }
+
+ protected short[] getMainTreeTable() {
+ return mainTreeTable;
+ }
+
+ protected short[] getAlignedTreeTable() {
+ return alignedTreeTable;
+ }
+
+ protected void setAlignedTreeTable(short[] alignedTreeTable) {
+ this.alignedTreeTable = alignedTreeTable;
+ }
+
+ protected short[] getLengthTreeTable() throws TikaException {
+ if (lengthTreeTable != null)
+ return this.lengthTreeTable;
+ else
+ throw new ChmParsingException("lengthTreeTable is null");
+ }
+
+ protected void setLengthTreeTable(short[] lengthTreeTable) {
+ this.lengthTreeTable = lengthTreeTable;
+ }
+
+ protected void setMainTreeTable(short[] mainTreeTable) {
+ this.mainTreeTable = mainTreeTable;
+ }
+
+ protected short[] getAlignedLenTable() {
+ return this.alignedLenTable;
+ }
+
+ protected void setAlignedLenTable(short[] alignedLenTable) {
+ this.alignedLenTable = alignedLenTable;
+ }
+
+ /**
+ * It suits for informative outlook
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("actual decoding window:=" + getWindow()
+ + System.getProperty("line.separator"));
+ sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+ + System.getProperty("line.separator"));
+ sb.append("current offset within the window:=" + getWindowPosition()
+ + System.getProperty("line.separator"));
+ sb.append("number of main tree elements:=" + getMainTreeElements()
+ + System.getProperty("line.separator"));
+ sb.append("have we started decoding at all yet?:=" + getHadStarted()
+ + System.getProperty("line.separator"));
+ sb.append("type of this block:=" + getBlockType()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed length of this block:=" + getBlockLength()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed bytes still left to decode:="
+ + getBlockRemaining() + System.getProperty("line.separator"));
+ sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+ + System.getProperty("line.separator"));
+ sb.append("magic header value used for transform:="
+ + getIntelFileSize() + System.getProperty("line.separator"));
+ sb.append("current offset in transform space:="
+ + getIntelCurrentPossition()
+ + System.getProperty("line.separator"));
+ sb.append("have we seen any translatable data yet?:=" + getIntelState()
+ + System.getProperty("line.separator"));
+ sb.append("R0 for the LRU offset system:=" + getR0()
+ + System.getProperty("line.separator"));
+ sb.append("R1 for the LRU offset system:=" + getR1()
+ + System.getProperty("line.separator"));
+ sb.append("R2 for the LRU offset system:=" + getR2()
+ + System.getProperty("line.separator"));
+ sb.append("main tree length:=" + getMainTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ public ChmLzxState(int window) throws TikaException {
+ if (window >= 0) {
+ int position_slots;
+ int win = ChmCommons.getWindowSize(window);
+ setWindowSize(1 << win);
+ /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+ if (win < 15 || win > 21)
+ throw new ChmParsingException("window less than 15 or window greater than 21");
+
+ /* Calculates required position slots */
+ if (win == 20)
+ position_slots = 42;
+ else if (win == 21)
+ position_slots = 50;
+ else
+ position_slots = win << 1;
+ //TODO: position_slots is not used ?
+ setR0(1);
+ setR1(1);
+ setR2(1);
+ setMainTreeElements(512);
+ setHadStarted(LzxState.NOT_STARTED_DECODING);
+ setFramesRead(0);
+ setBlockRemaining(0);
+ setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+ setIntelCurrentPossition(0);
+ setIntelState(IntelState.NOT_STARTED);
+ setWindowPosition(0);
+ setMainTreeLengtsTable(new short[getMainTreeElements()]);
+ setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+ } else
+ throw new CancellationException(
+ "window size should be more than zero");
+ }
+
+ protected void setWindow(int window) {
+ this.window = window;
+ }
+
+ protected int getWindow() {
+ return window;
+ }
+
+ protected void setWindowSize(long window_size) {
+ this.window_size = window_size;
+ }
+
+ protected long getWindowSize() {
+ return window_size;
+ }
+
+ protected void setWindowPosition(int window_position) {
+ this.window_position = window_position;
+ }
+
+ protected int getWindowPosition() {
+ return window_position;
+ }
+
+ protected void setMainTreeElements(int main_tree_elements) {
+ this.main_tree_elements = main_tree_elements;
+ }
+
+ protected int getMainTreeElements() {
+ return main_tree_elements;
+ }
+
+ protected void setHadStarted(LzxState hadStarted) {
+ this.hadStarted = hadStarted;
+ }
+
+ protected LzxState getHadStarted() {
+ return hadStarted;
+ }
+
+ protected void setBlockType(int block_type) {
+ this.block_type = block_type;
+ }
+
+ public int getBlockType() {
+ return block_type;
+ }
+
+ protected void setBlockLength(int block_length) {
+ this.block_length = block_length;
+ }
+
+ protected int getBlockLength() {
+ return block_length;
+ }
+
+ protected void setBlockRemaining(int block_remaining) {
+ this.block_remaining = block_remaining;
+ }
+
+ protected int getBlockRemaining() {
+ return block_remaining;
+ }
+
+ protected void setFramesRead(int frames_read) {
+ this.frames_read = frames_read;
+ }
+
+ protected void increaseFramesRead() {
+ this.frames_read = getFramesRead() + 1;
+ }
+
+ protected int getFramesRead() {
+ return frames_read;
+ }
+
+ protected void setIntelFileSize(int intel_file_size) {
+ this.intel_file_size = intel_file_size;
+ }
+
+ protected int getIntelFileSize() {
+ return intel_file_size;
+ }
+
+ protected void setIntelCurrentPossition(long intel_current_possition) {
+ this.intel_current_possition = intel_current_possition;
+ }
+
+ protected long getIntelCurrentPossition() {
+ return intel_current_possition;
+ }
+
+ protected void setIntelState(IntelState intel_state) {
+ this.intel_state = intel_state;
+ }
+
+ protected IntelState getIntelState() {
+ return intel_state;
+ }
+
+ protected void setR0(long r0) {
+ R0 = r0;
+ }
+
+ protected long getR0() {
+ return R0;
+ }
+
+ protected void setR1(long r1) {
+ R1 = r1;
+ }
+
+ protected long getR1() {
+ return R1;
+ }
+
+ protected void setR2(long r2) {
+ R2 = r2;
+ }
+
+ protected long getR2() {
+ return R2;
+ }
+
+ public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+ this.mainTreeLengtsTable = mainTreeLengtsTable;
+ }
+
+ public short[] getMainTreeLengtsTable() {
+ return mainTreeLengtsTable;
+ }
+
+ public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+ this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+ }
+
+ public short[] getLengthTreeLengtsTable() {
+ return lengthTreeLengtsTable;
+ }
+
+ private static short[] arrayClone(short[] a) {
+ return a==null ? null : (short[]) a.clone();
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
index c8944be..77f9b3a 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
@@ -1,222 +1,222 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-
-public class ChmSection {
- final private byte[] data;
- final private byte[] prevcontent;
- private int swath;// kiks
- private int total;// remains
- private int buffer;// val
-
- public ChmSection(byte[] data) throws TikaException {
- this(data, null);
- }
-
- public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
- ChmCommons.assertByteArrayNotNull(data);
- this.data = data;
- this.prevcontent = prevconent;
- //setData(data);
- }
-
- /* Utilities */
- public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
- ChmCommons.assertByteArrayNotNull(toBeReversed);
- ChmCommons.reverse(toBeReversed);
- return toBeReversed;
- }
-
- public int checkBit(int i) {
- return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
- }
-
- public int getSyncBits(int bit) {
- return getDesyncBits(bit, bit);
- }
-
- public int peekBits(int bit) {
- return getDesyncBits(bit, 0);
- }
-
- private int getDesyncBits(int bit, int removeBit) {
- while (getTotal() < 16) {
- setBuffer((getBuffer() << 16) + unmarshalUByte()
- + (unmarshalUByte() << 8));
- setTotal(getTotal() + 16);
- }
- int tmp = (getBuffer() >>> (getTotal() - bit));
- setTotal(getTotal() - removeBit);
- setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
- return tmp;
- }
-
- public int unmarshalUByte() {
- return getByte() & 255;
- }
-
- public byte getByte() {
- if (getSwath() < getData().length) {
- setSwath(getSwath() + 1);
- return getData()[getSwath() - 1];
- } else
- return 0;
- }
-
- public int getLeft() {
- return (getData().length - getSwath());
- }
-
- public byte[] getData() {
- return data;
- }
-
- public byte[] getPrevContent() {
- return prevcontent;
- }
-
- public BigInteger getBigInteger(int i) {
- if (getData() == null)
- return BigInteger.ZERO;
- if (getData().length - getSwath() < i)
- i = getData().length - getSwath();
- byte[] tmp = new byte[i];
- for (int j = i - 1; j >= 0; j--) {
- tmp[i - j - 1] = getData()[getSwath() + j];
- }
- setSwath(getSwath() + i);
- return new BigInteger(tmp);
- }
-
- public byte[] stringToAsciiBytes(String s) {
- char[] c = s.toCharArray();
- byte[] byteval = new byte[c.length];
- for (int i = 0; i < c.length; i++)
- byteval[i] = (byte) c[i];
- return byteval;
- }
-
- public BigInteger unmarshalUlong() {
- return getBigInteger(8);
- }
-
- public long unmarshalUInt() {
- return getBigInteger(4).longValue();
- }
-
- public int unmarshalInt() {
- return getBigInteger(4).intValue();
- }
-
- public byte[] unmarshalBytes(int i) {
- if (i == 0)
- return new byte[1];
- byte[] t = new byte[i];
- for (int j = 0; j < i; j++)
- t[j] = getData()[j + getSwath()];
- setSwath(getSwath() + i);
- return t;
- }
-
- public BigInteger getEncint() {
- byte ob;
- BigInteger bi = BigInteger.ZERO;
- byte[] nb = new byte[1];
- while ((ob = this.getByte()) < 0) {
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- }
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- return bi;
- }
-
- public char unmarshalUtfChar() {
- byte ob;
- int i = 1;
- byte[] ba;
- ob = this.getByte();
- if (ob < 0) {
- i = 2;
- while ((ob << (24 + i)) < 0)
- i++;
- }
- ba = new byte[i];
- ba[0] = ob;
- int j = 1;
- while (j < i) {
- ba[j] = this.getByte();
- j++;
- }
- i = ba.length;
- if (i == 1)
- return (char) ba[0];
- else {
- int n;
- n = ba[0] & 15; // 00001111b, gets last 4 bits
- j = 1;
- while (j < i)
- n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
- return (char) n;
- }
- }
-
-// private void setData(byte[] data) {
-// this.data = data;
-// }
-
- public int getSwath() {
- return swath;
- }
-
- public void setSwath(int swath) {
- this.swath = swath;
- }
-
- public int getTotal() {
- return total;
- }
-
- public void setTotal(int total) {
- this.total = total;
- }
-
- private int getBuffer() {
- return buffer;
- }
-
- private void setBuffer(int buffer) {
- this.buffer = buffer;
- }
-
- /**
- * @param args
- * @throws TikaException
- */
- public static void main(String[] args) throws TikaException {
- byte[] array = { 4, 78, -67, 90, 1, -33 };
- ChmSection chmSection = new ChmSection(array);
- System.out.println("before " + Arrays.toString(array));
- System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+public class ChmSection {
+ final private byte[] data;
+ final private byte[] prevcontent;
+ private int swath;// kiks
+ private int total;// remains
+ private int buffer;// val
+
+ public ChmSection(byte[] data) throws TikaException {
+ this(data, null);
+ }
+
+ public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+ ChmCommons.assertByteArrayNotNull(data);
+ this.data = data;
+ this.prevcontent = prevconent;
+ //setData(data);
+ }
+
+ /* Utilities */
+ public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+ ChmCommons.assertByteArrayNotNull(toBeReversed);
+ ChmCommons.reverse(toBeReversed);
+ return toBeReversed;
+ }
+
+ public int checkBit(int i) {
+ return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+ }
+
+ public int getSyncBits(int bit) {
+ return getDesyncBits(bit, bit);
+ }
+
+ public int peekBits(int bit) {
+ return getDesyncBits(bit, 0);
+ }
+
+ private int getDesyncBits(int bit, int removeBit) {
+ while (getTotal() < 16) {
+ setBuffer((getBuffer() << 16) + unmarshalUByte()
+ + (unmarshalUByte() << 8));
+ setTotal(getTotal() + 16);
+ }
+ int tmp = (getBuffer() >>> (getTotal() - bit));
+ setTotal(getTotal() - removeBit);
+ setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+ return tmp;
+ }
+
+ public int unmarshalUByte() {
+ return getByte() & 255;
+ }
+
+ public byte getByte() {
+ if (getSwath() < getData().length) {
+ setSwath(getSwath() + 1);
+ return getData()[getSwath() - 1];
+ } else
+ return 0;
+ }
+
+ public int getLeft() {
+ return (getData().length - getSwath());
+ }
+
+ public byte[] getData() {
+ return data;
+ }
+
+ public byte[] getPrevContent() {
+ return prevcontent;
+ }
+
+ public BigInteger getBigInteger(int i) {
+ if (getData() == null)
+ return BigInteger.ZERO;
+ if (getData().length - getSwath() < i)
+ i = getData().length - getSwath();
+ byte[] tmp = new byte[i];
+ for (int j = i - 1; j >= 0; j--) {
+ tmp[i - j - 1] = getData()[getSwath() + j];
+ }
+ setSwath(getSwath() + i);
+ return new BigInteger(tmp);
+ }
+
+ public byte[] stringToAsciiBytes(String s) {
+ char[] c = s.toCharArray();
+ byte[] byteval = new byte[c.length];
+ for (int i = 0; i < c.length; i++)
+ byteval[i] = (byte) c[i];
+ return byteval;
+ }
+
+ public BigInteger unmarshalUlong() {
+ return getBigInteger(8);
+ }
+
+ public long unmarshalUInt() {
+ return getBigInteger(4).longValue();
+ }
+
+ public int unmarshalInt() {
+ return getBigInteger(4).intValue();
+ }
+
+ public byte[] unmarshalBytes(int i) {
+ if (i == 0)
+ return new byte[1];
+ byte[] t = new byte[i];
+ for (int j = 0; j < i; j++)
+ t[j] = getData()[j + getSwath()];
+ setSwath(getSwath() + i);
+ return t;
+ }
+
+ public BigInteger getEncint() {
+ byte ob;
+ BigInteger bi = BigInteger.ZERO;
+ byte[] nb = new byte[1];
+ while ((ob = this.getByte()) < 0) {
+ nb[0] = (byte) ((ob & 0x7f));
+ bi = bi.shiftLeft(7).add(new BigInteger(nb));
+ }
+ nb[0] = (byte) ((ob & 0x7f));
+ bi = bi.shiftLeft(7).add(new BigInteger(nb));
+ return bi;
+ }
+
+ public char unmarshalUtfChar() {
+ byte ob;
+ int i = 1;
+ byte[] ba;
+ ob = this.getByte();
+ if (ob < 0) {
+ i = 2;
+ while ((ob << (24 + i)) < 0)
+ i++;
+ }
+ ba = new byte[i];
+ ba[0] = ob;
+ int j = 1;
+ while (j < i) {
+ ba[j] = this.getByte();
+ j++;
+ }
+ i = ba.length;
+ if (i == 1)
+ return (char) ba[0];
+ else {
+ int n;
+ n = ba[0] & 15; // 00001111b, gets last 4 bits
+ j = 1;
+ while (j < i)
+ n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
+ return (char) n;
+ }
+ }
+
+// private void setData(byte[] data) {
+// this.data = data;
+// }
+
+ public int getSwath() {
+ return swath;
+ }
+
+ public void setSwath(int swath) {
+ this.swath = swath;
+ }
+
+ public int getTotal() {
+ return total;
+ }
+
+ public void setTotal(int total) {
+ this.total = total;
+ }
+
+ private int getBuffer() {
+ return buffer;
+ }
+
+ private void setBuffer(int buffer) {
+ this.buffer = buffer;
+ }
+
+ /**
+ * @param args
+ * @throws TikaException
+ */
+ public static void main(String[] args) throws TikaException {
+ byte[] array = { 4, 78, -67, 90, 1, -33 };
+ ChmSection chmSection = new ChmSection(array);
+ System.out.println("before " + Arrays.toString(array));
+ System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 0e0e3da..86b1dd4 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -1,209 +1,209 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
- * DelegatingParser to process each mail.
- */
-public class MboxParser extends AbstractParser {
-
- public static final String MBOX_MIME_TYPE = "application/mbox";
- public static final String MBOX_RECORD_DIVIDER = "From ";
- public static final int MAIL_MAX_SIZE = 50000000;
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -1762689436731160661L;
- private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
- private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
- private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
-
- private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
- private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
- private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
- private boolean tracking = false;
-
- public static Date parseDate(String headerContent) throws ParseException {
- SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
- return dateFormat.parse(headerContent);
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, TikaException, SAXException {
-
- EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
-
- String charsetName = "windows-1252";
-
- metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
- metadata.set(Metadata.CONTENT_ENCODING, charsetName);
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- InputStreamReader isr = new InputStreamReader(stream, charsetName);
- try (BufferedReader reader = new BufferedReader(isr)) {
- String curLine = reader.readLine();
- int mailItem = 0;
- do {
- if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
- Metadata mailMetadata = new Metadata();
- Queue<String> multiline = new LinkedList<String>();
- mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
- mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
- curLine = reader.readLine();
-
- ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
- do {
- if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
- String latestLine = multiline.poll();
- latestLine += " " + curLine.trim();
- multiline.add(latestLine);
- } else {
- multiline.add(curLine);
- }
-
- message.write(curLine.getBytes(charsetName));
- message.write(0x0A);
- curLine = reader.readLine();
- }
- while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
-
- for (String item : multiline) {
- saveHeaderInMetadata(mailMetadata, item);
- }
-
- ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
- message = null;
-
- if (extractor.shouldParseEmbedded(mailMetadata)) {
- extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
- }
-
- if (tracking) {
- getTrackingMetadata().put(mailItem++, mailMetadata);
- }
- } else {
- curLine = reader.readLine();
- }
-
- } while (curLine != null && !Thread.currentThread().isInterrupted());
- }
-
- xhtml.endDocument();
- }
-
- public boolean isTracking() {
- return tracking;
- }
-
- public void setTracking(boolean tracking) {
- this.tracking = tracking;
- }
-
- public Map<Integer, Metadata> getTrackingMetadata() {
- return trackingMetadata;
- }
-
- private void saveHeaderInMetadata(Metadata metadata, String curLine) {
- Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
- if (!headerMatcher.matches()) {
- return; // ignore malformed header lines
- }
-
- String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
- String headerContent = headerMatcher.group(2);
-
- if (headerTag.equalsIgnoreCase("From")) {
- metadata.set(TikaCoreProperties.CREATOR, headerContent);
- } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
- || headerTag.equalsIgnoreCase("Bcc")) {
- Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
- if (address.find()) {
- metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
- } else if (headerContent.indexOf('@') > -1) {
- metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
- }
-
- String property = Metadata.MESSAGE_TO;
- if (headerTag.equalsIgnoreCase("Cc")) {
- property = Metadata.MESSAGE_CC;
- } else if (headerTag.equalsIgnoreCase("Bcc")) {
- property = Metadata.MESSAGE_BCC;
- }
- metadata.add(property, headerContent);
- } else if (headerTag.equalsIgnoreCase("Subject")) {
- metadata.add(Metadata.SUBJECT, headerContent);
- } else if (headerTag.equalsIgnoreCase("Date")) {
- try {
- Date date = parseDate(headerContent);
- metadata.set(TikaCoreProperties.CREATED, date);
- } catch (ParseException e) {
- // ignoring date because format was not understood
- }
- } else if (headerTag.equalsIgnoreCase("Message-Id")) {
- metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
- } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
- metadata.set(TikaCoreProperties.RELATION, headerContent);
- } else if (headerTag.equalsIgnoreCase("Content-Type")) {
- // TODO - key off content-type in headers to
- // set mapping to use for content and convert if necessary.
-
- metadata.add(Metadata.CONTENT_TYPE, headerContent);
- metadata.set(TikaCoreProperties.FORMAT, headerContent);
- } else {
- metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. Splits the mailbox at every "From " separator line and
+ * passes each mail to the {@link EmbeddedDocumentExtractor} taken from the
+ * {@link ParseContext}.
+ */
+public class MboxParser extends AbstractParser {
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    public static final int MAIL_MAX_SIZE = 50000000;
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1762689436731160661L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+    private boolean tracking = false;
+
+    /**
+     * Parses a date header value of the form {@code EEE, d MMM yyyy HH:mm:ss Z}
+     * using the US locale.
+     *
+     * @param headerContent the raw header value
+     * @return the parsed date
+     * @throws ParseException if the value does not match the expected format
+     */
+    public static Date parseDate(String headerContent) throws ParseException {
+        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the mailbox line by line (decoded as windows-1252). Every line that
+     * starts with {@link #MBOX_RECORD_DIVIDER} opens a new mail; the following
+     * lines are collected until the next divider, the end of the stream, or
+     * until {@link #MAIL_MAX_SIZE} bytes have been buffered. Each collected
+     * mail is handed to the embedded document extractor, and its metadata is
+     * recorded when tracking is enabled.
+     * <p>
+     * The reader wrapping {@code stream} is closed when parsing ends.
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        String charsetName = "windows-1252";
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        InputStreamReader isr = new InputStreamReader(stream, charsetName);
+        try (BufferedReader reader = new BufferedReader(isr)) {
+            String curLine = reader.readLine();
+            int mailItem = 0;
+            // An empty stream returns null on the very first read, so the
+            // line must be checked before it is dereferenced.
+            while (curLine != null) {
+                if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+                    Metadata mailMetadata = new Metadata();
+                    LinkedList<String> multiline = new LinkedList<String>();
+                    mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+                    mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                    curLine = reader.readLine();
+
+                    ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+                    // The divider may be the last line of the file; in that
+                    // case the mail simply has no content.
+                    if (curLine != null) {
+                        do {
+                            if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                                // Folded line: it continues the line added most
+                                // recently, i.e. the tail of the list, not its head.
+                                String previous = multiline.pollLast();
+                                multiline.add(previous == null
+                                        ? curLine.trim()
+                                        : previous + " " + curLine.trim());
+                            } else {
+                                multiline.add(curLine);
+                            }
+
+                            message.write(curLine.getBytes(charsetName));
+                            message.write(0x0A);
+                            curLine = reader.readLine();
+                        }
+                        while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER)
+                                && message.size() < MAIL_MAX_SIZE);
+                    }
+
+                    for (String item : multiline) {
+                        saveHeaderInMetadata(mailMetadata, item);
+                    }
+
+                    ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+
+                    if (extractor.shouldParseEmbedded(mailMetadata)) {
+                        extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+                    }
+
+                    if (tracking) {
+                        getTrackingMetadata().put(mailItem++, mailMetadata);
+                    }
+                } else {
+                    // Not inside a mail (or the size limit was hit): skip
+                    // ahead to the next divider.
+                    curLine = reader.readLine();
+                }
+
+                if (Thread.currentThread().isInterrupted()) {
+                    break;
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    public boolean isTracking() {
+        return tracking;
+    }
+
+    public void setTracking(boolean tracking) {
+        this.tracking = tracking;
+    }
+
+    /**
+     * Returns the live map of per-mail metadata, keyed by the order in which
+     * the mails were found. It is only filled while tracking is enabled.
+     */
+    public Map<Integer, Metadata> getTrackingMetadata() {
+        return trackingMetadata;
+    }
+
+    /**
+     * Interprets one collected line as a {@code name: value} header and stores
+     * it in the mail metadata. Lines that do not look like a header are
+     * ignored; headers without a dedicated mapping are stored under
+     * {@code "MboxParser-" + lower-cased name}.
+     */
+    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            return; // ignore malformed header lines
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.set(TikaCoreProperties.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+                || headerTag.equalsIgnoreCase("Bcc")) {
+            // Prefer the address between angle brackets; otherwise fall back
+            // to the whole value if it at least contains an '@'.
+            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+            if (address.find()) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+            } else if (headerContent.indexOf('@') > -1) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+            }
+
+            String property = Metadata.MESSAGE_TO;
+            if (headerTag.equalsIgnoreCase("Cc")) {
+                property = Metadata.MESSAGE_CC;
+            } else if (headerTag.equalsIgnoreCase("Bcc")) {
+                property = Metadata.MESSAGE_BCC;
+            }
+            metadata.add(property, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            try {
+                Date date = parseDate(headerContent);
+                metadata.set(TikaCoreProperties.CREATED, date);
+            } catch (ParseException e) {
+                // ignoring date because format was not understood
+            }
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.set(TikaCoreProperties.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.set(TikaCoreProperties.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index f7eec91..5883bd5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -1,203 +1,203 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static java.lang.String.valueOf;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static java.util.Collections.singleton;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import com.pff.PSTAttachment;
-import com.pff.PSTFile;
-import com.pff.PSTFolder;
-import com.pff.PSTMessage;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Parser for MS Outlook PST email storage files
- */
-public class OutlookPSTParser extends AbstractParser {
-
- private static final long serialVersionUID = 620998217748364063L;
-
- public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
- private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
-
- private static AttributesImpl createAttribute(String attName, String attValue) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", attName, attName, "CDATA", attValue);
- return attributes;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- // Use the delegate parser to parse the contained document
- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
-
- metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- TikaInputStream in = TikaInputStream.get(stream);
- PSTFile pstFile = null;
- try {
- pstFile = new PSTFile(in.getFile().getPath());
- metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
- boolean isValid = pstFile.getFileHandle().getFD().valid();
- metadata.set("isValid", valueOf(isValid));
- if (isValid) {
- parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
- }
- } catch (Exception e) {
- throw new TikaException(e.getMessage(), e);
- } finally {
- if (pstFile != null && pstFile.getFileHandle() != null) {
- try {
- pstFile.getFileHandle().close();
- } catch (IOException e) {
- //swallow closing exception
- }
- }
- }
-
- xhtml.endDocument();
- }
-
- private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
- throws Exception {
- if (pstFolder.getContentCount() > 0) {
- PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
- while (pstMail != null) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
- handler.startElement("div", attributes);
- handler.element("h1", pstMail.getSubject());
-
- parserMailItem(handler, pstMail, embeddedExtractor);
- parseMailAttachments(handler, pstMail, embeddedExtractor);
-
- handler.endElement("div");
-
- pstMail = (PSTMessage) pstFolder.getNextChild();
- }
- }
-
- if (pstFolder.hasSubfolders()) {
- for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
- handler.startElement("div", createAttribute("class", "email-folder"));
- handler.element("h1", pstSubFolder.getDisplayName());
- parseFolder(handler, pstSubFolder, embeddedExtractor);
- handler.endElement("div");
- }
- }
- }
-
- private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
- Metadata mailMetadata = new Metadata();
- mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
- mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
- mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
- mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
- mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
- mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
- mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
- mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
- mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
- mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
- mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
- mailMetadata.set("recipients", pstMail.getRecipientsString());
- mailMetadata.set("displayTo", pstMail.getDisplayTo());
- mailMetadata.set("displayCC", pstMail.getDisplayCC());
- mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
- mailMetadata.set("importance", valueOf(pstMail.getImportance()));
- mailMetadata.set("priority", valueOf(pstMail.getPriority()));
- mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
-
- byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
- embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
- }
-
- private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
- throws TikaException {
- int numberOfAttachments = email.getNumberOfAttachments();
- for (int i = 0; i < numberOfAttachments; i++) {
- File tempFile = null;
- try {
- PSTAttachment attach = email.getAttachment(i);
-
- // Get the filename; both long and short filenames can be used for attachments
- String filename = attach.getLongFilename();
- if (filename.isEmpty()) {
- filename = attach.getFilename();
- }
-
- xhtml.element("p", filename);
-
- Metadata attachMeta = new Metadata();
- attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
- attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", filename);
- xhtml.startElement("div", attributes);
- if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
- embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
- } finally {
- tmp.dispose();
- }
- }
- xhtml.endElement("div");
-
- } catch (Exception e) {
- throw new TikaException("Unable to unpack document stream", e);
- } finally {
- if (tempFile != null)
- tempFile.delete();
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files. Walks the folder tree of the
+ * PST file and hands every mail body and attachment to the
+ * {@link EmbeddedDocumentExtractor} taken from the {@link ParseContext}.
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    /**
+     * Builds an attribute list holding a single CDATA attribute.
+     */
+    private static AttributesImpl createAttribute(String attName, String attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Opens the stream as a PST file and emits its folders, mails and
+     * attachments as XHTML. {@code PSTFile} is opened from a file path, so the
+     * stream is accessed through {@code TikaInputStream.getFile()}.
+     *
+     * @throws TikaException wrapping any failure raised while reading the PST
+     *                       file or while processing its content
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Any temporary file created by getFile() is tracked by tmp and
+        // released in the finally block below.
+        TemporaryResources tmp = new TemporaryResources();
+        PSTFile pstFile = null;
+        try {
+            TikaInputStream in = TikaInputStream.get(stream, tmp);
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            // Close the PST file handle first, then release temporary resources
+            try {
+                if (pstFile != null && pstFile.getFileHandle() != null) {
+                    try {
+                        pstFile.getFileHandle().close();
+                    } catch (IOException e) {
+                        //swallow closing exception
+                    }
+                }
+            } finally {
+                tmp.dispose();
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Emits every mail of the folder as an "embedded" div (subject, body,
+     * attachments), then recurses into the sub-folders, each wrapped in an
+     * "email-folder" div.
+     */
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+                handler.startElement("div", attributes);
+                handler.element("h1", pstMail.getSubject());
+
+                parseMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div", createAttribute("class", "email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    /**
+     * Copies the properties of one mail into a fresh {@link Metadata} object
+     * and passes the mail body, encoded as UTF-8, to the embedded extractor.
+     */
+    private void parseMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+    }
+
+    /**
+     * Emits every attachment of the mail: its file name as a paragraph,
+     * followed by an "embedded" div containing whatever the embedded extractor
+     * produces for the attachment content.
+     *
+     * @throws TikaException wrapping any failure raised while reading or
+     *                       processing an attachment
+     */
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be used for attachments
+                String filename = attach.getLongFilename();
+                if (filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", filename);
+                xhtml.startElement("div", attributes);
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            }
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
index 36439b8..fa932a6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
@@ -1,99 +1,99 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Locale;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that:<ul>
- * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
- * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
- * </ul>
- */
-public class NSNormalizerContentHandler extends ContentHandlerDecorator {
-
- private static final String OLD_NS =
- "http://openoffice.org/2000/";
-
- private static final String NEW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:";
-
- private static final String DTD_PUBLIC_ID =
- "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
-
- public NSNormalizerContentHandler(ContentHandler handler) {
- super(handler);
- }
-
- private String mapOldNS(String ns) {
- if (ns != null && ns.startsWith(OLD_NS)) {
- return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
- } else {
- return ns;
- }
- }
-
- @Override
- public void startElement(
- String namespaceURI, String localName, String qName,
- Attributes atts) throws SAXException {
- AttributesImpl natts = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- natts.addAttribute(
- mapOldNS(atts.getURI(i)), atts.getLocalName(i),
- atts.getQName(i), atts.getType(i), atts.getValue(i));
- }
- super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
- }
-
- @Override
- public void endElement(String namespaceURI, String localName, String qName)
- throws SAXException {
- super.endElement(mapOldNS(namespaceURI), localName, qName);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri)
- throws SAXException {
- super.startPrefixMapping(prefix, mapOldNS(uri));
- }
-
- /**
- * do not load any DTDs (may be requested by parser). Fake the DTD by
- * returning a empty string as InputSource
- */
- @Override
- public InputSource resolveEntity(String publicId, String systemId)
- throws IOException, SAXException {
- if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
- || DTD_PUBLIC_ID.equals(publicId)) {
- return new InputSource(new StringReader(""));
- } else {
- return super.resolveEntity(publicId, systemId);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    /** Namespace URI prefix that marks an old OpenOffice 1.0 namespace. */
+    private static final String OLD_NS = "http://openoffice.org/2000/";
+
+    /** OpenDocument URN prefix substituted for {@link #OLD_NS}. */
+    private static final String NEW_NS = "urn:oasis:names:tc:opendocument:xmlns:";
+
+    /** Public id for which {@link #resolveEntity} returns an empty InputSource. */
+    private static final String DTD_PUBLIC_ID = "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    /** Wraps {@code handler}; it is handed to the superclass constructor. */
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /** Maps OLD_NS + x to NEW_NS + x + ":1.0"; any other value, or null, is returned as is. */
+    private String mapOldNS(String ns) {
+        if (ns != null && ns.startsWith(OLD_NS)) {
+            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+        }
+        return ns;
+    }
+
+    /** Passes the start tag to the superclass with element and attribute namespaces remapped. */
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes atts) throws SAXException {
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+                    atts.getQName(i), atts.getType(i), atts.getValue(i));
+        }
+        // Hand on the remapped copy: passing atts would discard the mapping done above.
+        super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+    }
+
+    /** Passes the end tag to the superclass with its namespace URI remapped. */
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    /** Passes the prefix mapping to the superclass with its namespace URI remapped. */
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /** Do not load any DTDs (may be requested by parser); fake them with an empty InputSource. */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId)
+            throws IOException, SAXException {
+        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+                || DTD_PUBLIC_ID.equals(publicId)) {
+            return new InputSource(new StringReader(""));
+        }
+        return super.resolveEntity(publicId, systemId);
+    }
+
+}