You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC

svn commit: r1723223 [28/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,54 @@
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Abstract class for recognizing a single charset.
+ * Part of the implementation of ICU's CharsetDetector.
+ *
+ * Each specific charset that can be recognized will have an instance
+ * of some subclass of this class.  All interaction between the overall
+ * CharsetDetector and the stuff specific to an individual charset happens
+ * via the interface provided here.
+ *
+ * Instances of CharsetRecognizer DO NOT have or maintain
+ * state pertaining to a specific match or detect operation.
+ * They WILL be shared by multiple instances of CharsetDetector.
+ * They encapsulate const charset-specific information.
+ *
+ * @internal
+ */
+abstract class CharsetRecognizer {
+    /**
+     * Get the IANA name of this charset.
+     * @return the charset name.
+     */
+    abstract String getName();
+
+    /**
+     * Get the ISO language code for this charset. This base implementation
+     * always returns <code>null</code>; subclasses may override it.
+     * @return the language code, or <code>null</code> if the language cannot be determined.
+     */
+    public String getLanguage() {
+        return null;
+    }
+
+    /**
+     * Test the match of this charset with the input text data obtained via the CharsetDetector object.
+     *
+     * @param det  The CharsetDetector, which contains the input text
+     *             to be checked for being in this charset.
+     * @return Two values packed into one int (Java has no multiple return values):
+     *             <br/>
+     *             bits 0-7:  the match confidence, ranging from 0-100
+     *             <br/>
+     *             bits 8-15: The match reason, an enum-like value.
+     */
+    abstract int match(CharsetDetector det);
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.CharsetUtils;
+
+public class Icu4jEncodingDetector implements EncodingDetector {
+
+    /**
+     * Runs the ICU4J-derived detector over the stream, seeded with any
+     * charset declared in the metadata, and returns the first candidate
+     * that resolves to a charset (<code>null</code> if none, or no stream).
+     */
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        CharsetDetector icu = new CharsetDetector();
+
+        String declared = metadata.get(Metadata.CONTENT_ENCODING);
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (declared == null && contentType != null) {
+            // TIKA-341: Use charset in content-type
+            MediaType parsed = MediaType.parse(contentType);
+            declared = (parsed == null) ? null : parsed.getParameters().get("charset");
+        }
+
+        String usable = (declared == null) ? null : CharsetUtils.clean(declared);
+        if (usable != null) {
+            icu.setDeclaredEncoding(usable);
+        } // TODO: log a warning when a declared charset cannot be cleaned?
+
+        // TIKA-341 without enabling input filtering (stripping of tags)
+        // short HTML tests don't work well
+        icu.enableInputFilter(true);
+        icu.setText(input);
+
+        for (CharsetMatch candidate : icu.detectAll()) {
+            try {
+                String candidateName = candidate.getName();
+                return CharsetUtils.forName(candidateName);
+            } catch (Exception e) {
+                // this candidate could not be turned into a Charset; try the next one
+            }
+        }
+
+        // no candidate could be resolved
+        return null;
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Plain text parser. The text encoding of the document stream is
+ * automatically detected based on the byte patterns found at the
+ * beginning of the stream and the given document metadata, most
+ * notably the <code>charset</code> parameter of a
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
+ * <p/>
+ * This parser sets the following output metadata entries:
+ * <dl>
+ * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
+ * <dd><code>text/plain; charset=...</code></dd>
+ * </dl>
+ */
+public class TXTParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6656102320836888910L;
+
+    /** The only media type handled here: <code>text/plain</code>. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.TEXT_PLAIN);
+
+    /** Fallback loader handed to the parse context lookup in {@link #parse}. */
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(TXTParser.class.getClassLoader());
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Decodes the stream with an auto-detected charset, records that charset
+     * in the metadata and emits the whole text as one XHTML paragraph.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // CloseShieldInputStream stops close() from reaching the caller's stream
+        InputStream shielded = new CloseShieldInputStream(stream);
+        ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
+        try (AutoDetectReader text = new AutoDetectReader(shielded, metadata, loader)) {
+            Charset detected = text.getCharset();
+            metadata.set(Metadata.CONTENT_TYPE,
+                    new MediaType(MediaType.TEXT_PLAIN, detected).toString());
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, detected.name());
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            xhtml.startElement("p");
+
+            char[] chunk = new char[4096];
+            for (int len = text.read(chunk); len != -1; len = text.read(chunk)) {
+                xhtml.characters(chunk, 0, len);
+            }
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+public class UniversalEncodingDetector implements EncodingDetector {
+
+    private static final int BUFSIZE = 1024;
+
+    private static final int LOOKAHEAD = 16 * BUFSIZE;
+
+    /**
+     * Feeds at most LOOKAHEAD bytes of the marked stream to the listener and
+     * returns what it reports at the end; the stream is always reset afterwards.
+     */
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        input.mark(LOOKAHEAD);
+        try {
+            UniversalEncodingListener sink = new UniversalEncodingListener(metadata);
+            byte[] chunk = new byte[BUFSIZE];
+            int seen = 0;
+            for (int got = input.read(chunk);
+                    got != -1 && seen < LOOKAHEAD && !sink.isDone();
+                    got = input.read(chunk, 0, Math.min(chunk.length, LOOKAHEAD - seen))) {
+                seen += got;
+                sink.handleData(chunk, 0, got);
+            }
+            return sink.dataEnd();
+        } catch (LinkageError e) {
+            return null; // juniversalchardet is not available
+        } finally {
+            input.reset();
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.nio.charset.Charset;
+
+import org.apache.tika.detect.TextStatistics;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.CharsetUtils;
+import org.mozilla.universalchardet.CharsetListener;
+import org.mozilla.universalchardet.Constants;
+import org.mozilla.universalchardet.UniversalDetector;
+
+/**
+ * Helper class used by {@link UniversalEncodingDetector} to access the
+ * <code>juniversalchardet</code> detection logic.
+ */
+class UniversalEncodingListener implements CharsetListener {
+
+    private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";
+
+    private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";
+
+    private final TextStatistics statistics = new TextStatistics();
+
+    private final UniversalDetector detector = new UniversalDetector(this);
+
+    private String hint = null;
+
+    private Charset charset = null;
+
+    public UniversalEncodingListener(Metadata metadata) {
+        MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+        if (type != null) {
+            hint = type.getParameters().get("charset");
+        }
+        if (hint == null) {
+            hint = metadata.get(Metadata.CONTENT_ENCODING);
+        }
+    }
+
+    public void report(String name) {
+        if (Constants.CHARSET_WINDOWS_1252.equals(name)) {
+            if (hint != null) {
+                // Use the encoding hint when available
+                name = hint;
+            } else if (statistics.count('\r') == 0) {
+                // If there are no CR(LF)s, then the encoding is more
+                // likely to be ISO-8859-1(5) than windows-1252
+                if (statistics.count(0xa4) > 0) { // currency/euro sign
+                    // The general currency sign is hardly ever used in
+                    // ISO-8859-1, so it's more likely that we're dealing
+                    // with ISO-8859-15, where the character is used for
+                    // the euro symbol, which is more commonly used.
+                    name = CHARSET_ISO_8859_15;
+                } else {
+                    name = CHARSET_ISO_8859_1;
+                }
+            }
+        }
+        try {
+            this.charset = CharsetUtils.forName(name);
+        } catch (Exception e) {
+            // ignore
+        }
+    }
+
+    public boolean isDone() {
+        return detector.isDone();
+    }
+
+    public void handleData(byte[] buf, int offset, int length) {
+        statistics.addData(buf, offset, length);
+        detector.handleData(buf, offset, length);
+    }
+
+    public Charset dataEnd() {
+        detector.dataEnd();
+        if (charset == null && statistics.isMostlyAscii()) {
+            report(Constants.CHARSET_WINDOWS_1252);
+        }
+        return charset;
+    }
+
+}
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 0.10
+ */
+class AbstractMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+    private final Property property;
+    private final String name;
+
+    protected AbstractMetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+    protected AbstractMetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Records the given metadata value unless it is <code>null</code>,
+     * empty, or already present under the target name. A differing
+     * value is added next to the existing one(s), except for a property
+     * that does not permit multiple values, whose value is replaced.
+     *
+     * @param value metadata value
+     */
+    protected void addMetadata(String value) {
+        if (value == null || value.length() == 0) {
+            return;
+        }
+        if (metadata.isMultiValued(name)) {
+            // Add the value, assuming it's not already there
+            List<String> current = Arrays.asList(metadata.getValues(name));
+            if (!current.contains(value)) {
+                append(value);
+            }
+            return;
+        }
+        String existing = metadata.get(name);
+        if (existing == null || existing.length() == 0) {
+            if (property == null) {
+                metadata.set(name, value);
+            } else {
+                metadata.set(property, value);
+            }
+        } else if (!existing.equals(value)) {
+            if (property != null && !property.isMultiValuePermitted()) {
+                // Replace the existing value if isMultiValuePermitted is false
+                metadata.set(property, value);
+            } else {
+                append(value);
+            }
+        }
+    }
+
+    /** Adds the value through the property when there is one, else by name. */
+    private void append(String value) {
+        if (property != null) {
+            metadata.add(property, value);
+        } else {
+            metadata.add(name, value);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds a Metadata entry for a given node.
+ * The textual content of the node is used as the
+ *  value, and the Metadata name is taken from
+ *  an attribute, with a prefix if required. 
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    private final String nameHoldingAttribute;
+    private final String namePrefix;
+    private String name;
+
+    private final StringBuilder buffer = new StringBuilder();
+
+    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+        this.metadata = metadata;
+        this.nameHoldingAttribute = nameHoldingAttribute;
+        this.namePrefix = namePrefix;
+    }
+
+    /**
+     * Stores the value under the current name, joined to any existing value
+     * with ", ". A <code>null</code> or empty value, or a missing name, is ignored.
+     */
+    public void addMetadata(String value) {
+        if (name == null || name.length() == 0) {
+            // We didn't find the attribute which holds the name
+            return;
+        }
+        // Null is ignored (not an NPE), as in AbstractMetadataHandler
+        if (value != null && value.length() > 0) {
+            String previous = metadata.get(name);
+            if (previous != null && previous.length() > 0) {
+                value = previous + ", " + value;
+            }
+            metadata.set(name, value);
+        }
+    }
+
+    public void endElement(String uri, String localName, String name) {
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+    }
+
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        String rawName = attributes.getValue(nameHoldingAttribute);
+        if (rawName != null) {
+            this.name = (namePrefix == null) ? rawName : namePrefix + rawName;
+        }
+        // All other attributes are ignored
+    }
+
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that maps the contents of an XML attribute into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+    private final String uri;
+
+    private final String localName;
+
+    public AttributeMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+    }
+    public AttributeMetadataHandler(
+          String uri, String localName, Metadata metadata, Property property) {
+      super(metadata, property);
+      this.uri = uri;
+      this.localName = localName;
+  }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            if (attributes.getURI(i).equals(this.uri)
+                    && attributes.getLocalName(i).equals(this.localName)) {
+                addMetadata(attributes.getValue(i).trim());
+            }
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Dublin Core metadata parser
+ */
+public class DcXMLParser extends XMLParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 4905318835463880819L;
+
+    private static ContentHandler getDublinCoreHandler(
+            Metadata metadata, Property property, String element) {
+        return new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, element,
+                metadata, property);
+    }
+
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        return new TeeContentHandler(
+                super.getContentHandler(handler, metadata, context),
+                getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+
+/**
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ * <p>
+ * Two layouts are recognised below the configured element: plain character
+ * content, and values wrapped in an RDF bag ({@code rdf:Bag} containing
+ * {@code rdf:li} items), in which case every {@code rdf:li} yields one value.
+ *
+ * @since Apache Tika 0.10
+ */
+public class ElementMetadataHandler extends AbstractMetadataHandler {
+    /**
+     * Logger for this class
+     */
+    private static final Log logger = LogFactory
+            .getLog(ElementMetadataHandler.class);
+
+    private static final String LOCAL_NAME_RDF_BAG = "Bag";
+    private static final String LOCAL_NAME_RDF_LI = "li";
+    private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+    private final String uri;
+
+    private final String localName;
+
+    private final Metadata metadata;
+
+    private final String name;
+
+    /**
+     * The target property, or <code>null</code> when the handler was created
+     * with a plain string key.
+     */
+    private final Property targetProperty;
+
+    private final boolean allowDuplicateValues;
+    private final boolean allowEmptyValues;
+
+    /**
+     * The buffer used to capture characters when inside a bag li element.
+     */
+    private final StringBuilder bufferBagged = new StringBuilder();
+
+    /**
+     * The buffer used to capture characters inside standard elements.
+     */
+    private final StringBuilder bufferBagless = new StringBuilder();
+
+    /**
+     * Whether or not the value was found in a standard element structure or inside a bag.
+     */
+    private boolean isBagless = true;
+
+    /**
+     * Nesting depth of currently open elements accepted by
+     * {@link #isMatchingElement(String, String)}.
+     */
+    private int matchLevel = 0;
+
+    /**
+     * Nesting depth of currently open elements accepted by
+     * {@link #isMatchingParentElement(String, String)}.
+     */
+    private int parentMatchLevel = 0;
+
+    /**
+     * Constructor for string metadata keys.
+     * Duplicate and empty values are not added.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param name the Tika metadata field key
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        this(uri, localName, metadata, name, false, false);
+    }
+
+    /**
+     * Constructor for string metadata keys which allows change of behavior
+     * for duplicate and empty entry values.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param name the Tika metadata field key
+     * @param allowDuplicateValues add duplicate values to the Tika metadata
+     * @param allowEmptyValues add empty values to the Tika metadata
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.name = name;
+        this.targetProperty = null;
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        if (logger.isTraceEnabled()) {
+            logger.trace("created simple handler for " + this.name);
+        }
+    }
+
+    /**
+     * Constructor for Property metadata keys.
+     * Duplicate and empty values are not added.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, Property targetProperty) {
+        this(uri, localName, metadata, targetProperty, false, false);
+    }
+
+    /**
+     * Constructor for Property metadata keys which allows change of behavior
+     * for duplicate and empty entry values.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     * @param allowDuplicateValues add duplicate values to the Tika metadata
+     * @param allowEmptyValues add empty values to the Tika metadata
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, targetProperty);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.targetProperty = targetProperty;
+        this.name = targetProperty.getName();
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        if (logger.isTraceEnabled()) {
+            logger.trace("created property handler for " + this.name);
+        }
+    }
+
+    /**
+     * Tells whether the given element is the element this handler was
+     * configured for.
+     *
+     * @param uri the namespace uri of the element
+     * @param localName the local name of the element
+     * @return <code>true</code> if both the namespace and the local name match
+     */
+    protected boolean isMatchingParentElement(String uri, String localName) {
+        return (uri.equals(this.uri) && localName.equals(this.localName));
+    }
+
+    /**
+     * Tells whether the given element takes part in value extraction: either
+     * the configured element itself, or an RDF <code>Bag</code>/<code>li</code>
+     * element encountered while the configured element is open.
+     *
+     * @param uri the namespace uri of the element
+     * @param localName the local name of the element
+     * @return <code>true</code> if the element counts towards the match level
+     */
+    protected boolean isMatchingElement(String uri, String localName) {
+        // match if we're inside the parent element or within some bag element
+        return (uri.equals(this.uri) && localName.equals(this.localName)) ||
+                (parentMatchLevel > 0 && uri.equals(URI_RDF) &&
+                        (localName.equals(LOCAL_NAME_RDF_BAG) || localName.equals(LOCAL_NAME_RDF_LI)));
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        // The match check runs before the parent counter is bumped, so the
+        // bag/li alternative only applies to elements nested in the parent.
+        if (isMatchingElement(uri, localName)) {
+            matchLevel++;
+        }
+        if (isMatchingParentElement(uri, localName)) {
+            parentMatchLevel++;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        if (isMatchingParentElement(uri, localName)) {
+            parentMatchLevel--;
+        }
+        if (isMatchingElement(uri, localName)) {
+            matchLevel--;
+            if (matchLevel == 2) {
+                // we've just left a bag li element, add the bagged buffer
+                addMetadata(bufferBagged.toString().trim());
+                bufferBagged.setLength(0);
+                isBagless = false;
+            }
+            if (matchLevel == 0) {
+                if (isBagless) {
+                    String valueBagless = bufferBagless.toString();
+                    // NOTE(review): values containing the text "Bag" are skipped here;
+                    // kept as-is because the intent is unclear - confirm whether needed.
+                    if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
+                        // we're in a standard element, add the bagless buffer
+                        addMetadata(valueBagless.trim());
+                    }
+                }
+                // The outermost element is closed: always start the next
+                // occurrence from a clean state. Without this, text captured
+                // during a bag (or a skipped value) would leak into later
+                // values, and a plain element following a bag would be ignored.
+                bufferBagless.setLength(0);
+                isBagless = true;
+            }
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        // We need to append to both buffers since we don't know if we're
+        // inside a bag until we're done
+        if (parentMatchLevel > 0 && matchLevel > 2) {
+            bufferBagged.append(ch, start, length);
+        }
+        if (parentMatchLevel > 0 && matchLevel > 0) {
+            bufferBagless.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) {
+        characters(ch, start, length);
+    }
+
+    /**
+     * Adds the given value to the metadata.
+     * <p>
+     * For a multi-valued target property the value is added as a separate
+     * entry, subject to the empty-value and duplicate-value settings of this
+     * handler; in every other case the superclass implementation is used.
+     *
+     * @param value the value to add, may be <code>null</code>
+     */
+    @Override
+    protected void addMetadata(String value) {
+        if (logger.isTraceEnabled()) {
+            logger.trace("adding " + name + "=" + value);
+        }
+        if (targetProperty == null || !targetProperty.isMultiValuePermitted()) {
+            super.addMetadata(value);
+            return;
+        }
+        boolean isEmpty = (value == null || value.length() == 0);
+        if (isEmpty) {
+            if (!allowEmptyValues) {
+                return;
+            }
+            // normalise null to the empty string before it is stored
+            value = "";
+        }
+        String[] previous = metadata.getValues(name);
+        if (allowDuplicateValues || previous == null || !Arrays.asList(previous).contains(value)) {
+            metadata.add(targetProperty, value);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+public class FictionBookParser extends XMLParser {
+    private static final long serialVersionUID = 4195954546491524374L;
+    
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(MediaType.application("x-fictionbook+xml"));
+    }
+
+    @Override
+    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex == null) {
+            ex = new ParsingEmbeddedDocumentExtractor(context);
+        }
+
+        return new BinaryElementsDataHandler(ex, handler);
+    }
+
+    private static class BinaryElementsDataHandler extends DefaultHandler {
+        private static final String ELEMENT_BINARY = "binary";
+
+        private boolean binaryMode = false;
+        private static final String ATTRIBUTE_ID = "id";
+
+        private final EmbeddedDocumentExtractor partExtractor;
+        private final ContentHandler handler;
+        private final StringBuilder binaryData = new StringBuilder();
+        private Metadata metadata;
+        private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
+
+        private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
+            this.partExtractor = partExtractor;
+            this.handler = handler;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+            binaryMode = ELEMENT_BINARY.equals(localName);
+            if (binaryMode) {
+                binaryData.setLength(0);
+                metadata = new Metadata();
+
+                metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
+                metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if (binaryMode) {
+                try {
+                    partExtractor.parseEmbedded(
+                            new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
+                            handler,
+                            metadata,
+                            true
+                    );
+                } catch (IOException e) {
+                    throw new SAXException("IOException in parseEmbedded", e);
+                }
+
+                binaryMode = false;
+                binaryData.setLength(0);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            if (!binaryMode) {
+                handler.characters(ch, start, length);
+            } else {
+                binaryData.append(ch, start, length);
+            }
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+            handler.ignorableWhitespace(ch, start, length);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds Metadata entries with a specified name for
+ *  the textual content of a node (if present), and
+ *  all attribute values passed through the matcher
+ *  (but not their names).
+ * <p>
+ * Every non-empty value is appended to the value already stored under the
+ * name, separated by <code>", "</code>.
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ *             {@link ElementMetadataHandler} classes instead
+ */
+@Deprecated
+public class MetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    /** The target property, or <code>null</code> when a plain name is used. */
+    private final Property property;
+    private final String name;
+
+    /** Collects the character content seen since the last element end. */
+    private final StringBuilder buffer = new StringBuilder();
+
+    /**
+     * Creates a handler that stores values under a plain string key.
+     *
+     * @param metadata the metadata object to populate
+     * @param name the metadata key
+     */
+    public MetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+
+    /**
+     * Creates a handler that stores values under a property.
+     *
+     * @param metadata the metadata object to populate
+     * @param property the metadata property; its name is used for lookups
+     */
+    public MetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Appends the given value to the metadata entry of this handler.
+     * <code>null</code> and empty values are ignored.
+     *
+     * @param value the value to add
+     */
+    public void addMetadata(String value) {
+        if (value != null && value.length() > 0) {
+            String previous = metadata.get(name);
+            if (previous != null && previous.length() > 0) {
+                value = previous + ", " + value;
+            }
+
+            if (this.property != null) {
+                metadata.set(property, value);
+            } else {
+                metadata.set(name, value);
+            }
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        // Flush the text collected so far, then start over for the next element.
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            addMetadata(attributes.getValue(i));
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * XML parser.
+ * <p>
+ * Parses the stream with the SAX parser of the parse context and passes the
+ * events to the handler returned by
+ * {@link #getContentHandler(ContentHandler, Metadata, ParseContext)}, which
+ * subclasses may override.
+ */
+public class XMLParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6028836725280212837L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("xml"),
+                MediaType.image("svg+xml"))));
+
+    /**
+     * @return the unmodifiable set holding <code>application/xml</code> and
+     *         <code>image/svg+xml</code>
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the given XML stream.
+     * <p>
+     * The content type is set to <code>application/xml</code> unless the
+     * metadata already carries one. The output is framed by an XHTML
+     * document with a single <code>p</code> element, which is closed even
+     * when parsing fails. The stream itself is shielded from being closed.
+     *
+     * @param stream the XML input
+     * @param handler the handler receiving the output
+     * @param metadata the document metadata
+     * @param context the parse context supplying the SAX parser
+     * @throws TikaException if the SAX parser reports an error that is not
+     *         rethrown by the tagged handler
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
+        }
+
+        final XHTMLContentHandler xhtml =
+            new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+
+        // NOTE(review): the tagged handler wraps the caller's handler directly,
+        // not the xhtml handler - confirm this is intended.
+        TaggedContentHandler tagged = new TaggedContentHandler(handler);
+        try {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            getContentHandler(tagged, metadata, context))));
+        } catch (SAXException e) {
+            // presumably rethrows exceptions raised by the downstream handler;
+            // anything else is reported as a parse error
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Returns the handler that receives the SAX events of the parsed XML.
+     * This implementation returns a {@link TextContentHandler} around the
+     * given handler.
+     *
+     * @param handler the downstream handler
+     * @param metadata the document metadata (unused here)
+     * @param context the parse context (unused here)
+     * @return the handler to feed with the SAX events
+     */
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        return new TextContentHandler(handler, true);
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector Wed Jan  6 03:50:50 2016
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.txt.UniversalEncodingDetector
+org.apache.tika.parser.txt.Icu4jEncodingDetector

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan  6 03:50:50 2016
@@ -0,0 +1,22 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.txt.TXTParser
+#org.apache.tika.parser.strings.Latin1StringsParser
+#org.apache.tika.parser.strings.StringsParser
+org.apache.tika.parser.xml.DcXMLParser
+org.apache.tika.parser.xml.FictionBookParser
+#org.apache.tika.parser.xml.XMLParser

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,28 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+public class FileConfigTest {
+
+	/**
+	 * A configuration created without arguments must report an empty
+	 * file path and a disabled mime option.
+	 */
+	@Test
+	public void testNoConfig() {
+		final FileConfig defaults = new FileConfig();
+
+		final String expectedPath = "";
+		assertEquals("Invalid default filePath value", expectedPath, defaults.getFilePath());
+
+		final boolean expectedMime = false;
+		assertEquals("Invalid default mime option value", expectedMime, defaults.isMimetype());
+	}
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,69 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.nio.charset.StandardCharsets.UTF_16;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class Latin1StringsParserTest {
+
+    /**
+     * Feeds the parser a byte sequence in which the same text appears three
+     * times in different encodings, separated by zero and control bytes and
+     * followed by a two-character string, and expects the output to be the
+     * text three times, one occurrence per line.
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        final String testStr = "These are Latin1 accented scripts: \u00C2 \u00C3 \u00C9 \u00DC \u00E2 \u00E3 \u00E9 \u00FC";
+        final String smallStr = "ab";
+
+        final byte[] zeros = new byte[10];
+        final byte[] trashBytes = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };
+
+        // Segments of the input, in the order they are written to the stream.
+        final byte[][] segments = {
+            testStr.getBytes(ISO_8859_1),
+            zeros,
+            testStr.getBytes(UTF_8),
+            trashBytes,
+            testStr.getBytes(UTF_16),
+            zeros,
+            smallStr.getBytes(ISO_8859_1)
+        };
+
+        ByteArrayOutputStream input = new ByteArrayOutputStream();
+        for (byte[] segment : segments) {
+            input.write(segment);
+        }
+
+        Parser parser = new Latin1StringsParser();
+        ContentHandler handler = new BodyContentHandler();
+
+        try (InputStream stream = new ByteArrayInputStream(input.toByteArray())) {
+            parser.parse(stream, handler, new Metadata(), new ParseContext());
+        }
+
+        StringBuilder expectedText = new StringBuilder();
+        for (int i = 0; i < 3; i++) {
+            expectedText.append(testStr).append("\n");
+        }
+        String expected = expectedText.toString();
+        String result = handler.toString();
+
+        // Test if result contains only the test string appended 3 times
+        assertTrue(result.equals(expected));
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,61 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.InputStream;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link StringsConfig}: the built-in defaults, values loaded
+ * from properties streams, and rejection of an invalid minimum length.
+ */
+public class StringsConfigTest {
+
+	/**
+	 * A configuration created without any properties must report the default
+	 * strings path, encoding, minimum length and timeout.
+	 */
+	@Test
+	public void testNoConfig() {
+		StringsConfig config = new StringsConfig();
+		assertEquals("Invalid default filePath value", "", config.getStringsPath());
+		assertEquals("Invalid default encoding value", StringsEncoding.SINGLE_7_BIT, config.getEncoding());
+		assertEquals("Invalid default min-len value", 4, config.getMinLength());
+		assertEquals("Invalid default timeout value", 120, config.getTimeout());
+	}
+
+	/**
+	 * A properties stream that sets only some keys must override the encoding
+	 * and timeout while the strings path and minimum length keep their defaults.
+	 *
+	 * @throws Exception if closing the properties stream fails
+	 */
+	@Test
+	public void testPartialConfig() throws Exception {
+		// try-with-resources releases the classpath stream even when an assertion fails.
+		try (InputStream stream =
+				StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-partial.properties")) {
+			StringsConfig config = new StringsConfig(stream);
+			assertEquals("Invalid default stringsPath value", "", config.getStringsPath());
+			assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+			assertEquals("Invalid default min-len value", 4, config.getMinLength());
+			assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+		}
+	}
+
+	/**
+	 * A properties stream that sets every key must override all four values.
+	 * The expected strings path carries a trailing {@link File#separator}.
+	 *
+	 * @throws Exception if closing the properties stream fails
+	 */
+	@Test
+	public void testFullConfig() throws Exception {
+		// try-with-resources releases the classpath stream even when an assertion fails.
+		try (InputStream stream =
+				StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-full.properties")) {
+			StringsConfig config = new StringsConfig(stream);
+			assertEquals("Invalid overridden stringsPath value", "/opt/strings" + File.separator, config.getStringsPath());
+			assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+			assertEquals("Invalid overridden min-len value", 3, config.getMinLength());
+			assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+		}
+	}
+
+	/**
+	 * Despite its name, this test covers minimum-length validation: setting a
+	 * minimum length of {@code 0} must raise {@link IllegalArgumentException}.
+	 * The method name is kept unchanged so existing test filters keep matching.
+	 */
+	@Test(expected=IllegalArgumentException.class)
+	public void testValidateEconding() {
+		StringsConfig config = new StringsConfig();
+		config.setMinLength(0);
+	}
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,74 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.apache.tika.parser.strings.StringsParser.getStringsProg;
+import static org.junit.Assert.*;
+import static org.junit.Assume.assumeTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs {@link StringsParser} over a sample document and checks the extracted
+ * text. The parse test is skipped when {@link #canRun()} returns {@code false}.
+ */
+public class StringsParserTest {
+	/**
+	 * Probes whether the strings command can be executed.
+	 *
+	 * @return the outcome of {@code ExternalParser.check} for the configured
+	 *         strings command invoked with {@code --version}
+	 */
+	public static boolean canRun() {
+		String stringsCommand = new StringsConfig().getStringsPath() + getStringsProg();
+		return ExternalParser.check(new String[] {stringsCommand, "--version"});
+	}
+
+	/**
+	 * Parses {@code testOCTET_header.dbase3} with default configurations and
+	 * expects every listed field name to appear in the extracted text.
+	 *
+	 * @throws Exception declared for the test body; parse failures themselves
+	 *         are caught and printed below
+	 */
+	@Test
+	public void testParse() throws Exception {
+		assumeTrue(canRun());
+
+		final String samplePath = "/test-documents/testOCTET_header.dbase3";
+		final String[] expectedWords = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", "LISTDATE" };
+		final String[] expectedMetadataNames = {"min-len", "encoding", "strings:file_output"};
+
+		StringsConfig stringsSettings = new StringsConfig();
+		FileConfig fileSettings = new FileConfig();
+
+		Parser stringsParser = new StringsParser();
+		ContentHandler body = new BodyContentHandler();
+		Metadata parsedMetadata = new Metadata();
+
+		ParseContext parseContext = new ParseContext();
+		parseContext.set(StringsConfig.class, stringsSettings);
+		parseContext.set(FileConfig.class, fileSettings);
+
+		try (InputStream sample = StringsParserTest.class.getResourceAsStream(samplePath)) {
+			stringsParser.parse(sample, body, parsedMetadata, parseContext);
+		} catch (Exception e) {
+			// NOTE(review): a parse failure is only printed here; it can surface
+			// solely through the content assertions that follow.
+			e.printStackTrace();
+		}
+
+		// Content: each expected word must occur somewhere in the extracted text.
+		String extracted = body.toString();
+		for (int i = 0; i < expectedWords.length; i++) {
+			assertTrue(extracted.contains(expectedWords[i]));
+		}
+
+		// Metadata
+		// NOTE(review): the boolean result is discarded, so this line verifies
+		// nothing. Before turning it into an assertion, confirm the exact names
+		// (and order) the parser writes into the metadata.
+		Arrays.equals(expectedMetadataNames, parsedMetadata.names());
+	}
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link CharsetDetector} that run against the {@code resume.html}
+ * sample document.
+ */
+public class CharsetDetectorTest {
+
+    /** Classpath location of the HTML sample shared by both tests. */
+    private static final String SAMPLE_DOCUMENT = "/test-documents/resume.html";
+
+    /**
+     * With the input filter enabled, the match with the highest confidence
+     * (the earliest one on ties) must be named {@code UTF-8}.
+     */
+    @Test
+    public void testTagDropper() throws IOException {
+        try (InputStream in = CharsetDetectorTest.class.getResourceAsStream(SAMPLE_DOCUMENT)) {
+            CharsetDetector detector = new CharsetDetector();
+            detector.enableInputFilter(true);
+            detector.setText(in);
+
+            CharsetMatch best = null;
+            for (CharsetMatch candidate : detector.detectAll()) {
+                // Keep the current best unless the candidate is strictly more confident.
+                boolean keepCurrent = best != null
+                        && best.getConfidence() >= candidate.getConfidence();
+                if (!keepCurrent) {
+                    best = candidate;
+                }
+            }
+            assertTrue(best != null);
+            assertEquals("UTF-8", best.getName());
+        }
+    }
+
+    /**
+     * Verifies that a null or empty declared encoding doesn't cause an exception
+     * and still yields a reader that is ready.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1248">TIKA-1248</a>
+     */
+    @Test
+    public void testEmptyOrNullDeclaredCharset() throws IOException {
+        try (InputStream in = CharsetDetectorTest.class.getResourceAsStream(SAMPLE_DOCUMENT)) {
+            CharsetDetector detector = new CharsetDetector();
+            // Same stream, first with a null and then with an empty declared encoding.
+            for (String declaredEncoding : new String[] {null, ""}) {
+                Reader reader = detector.getReader(in, declaredEncoding);
+                assertTrue(reader.ready());
+            }
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TXTParserTest {
+
+    private Parser parser = new TXTParser();
+
+    @Test
+    public void testEnglishText() throws Exception {
+        String text =
+                "Hello, World! This is simple UTF-8 text content written"
+                        + " in English to test autodetection of both the character"
+                        + " encoding and the language of the input stream.";
+
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+        String content = writer.toString();
+
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+        // TIKA-501: Remove language detection from TXTParser
+        assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
+        assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
+
+        assertContains("Hello", content);
+        assertContains("World", content);
+        assertContains("autodetection", content);
+        assertContains("stream", content);
+    }
+
+    @Test
+    public void testUTF8Text() throws Exception {
+        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(UTF_8)),
+                handler, metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+        assertContains(text, handler.toString());
+    }
+
+    @Test
+    public void testEmptyText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("\n", handler.toString());
+    }
+
+    /**
+     * Test for the heuristics that we use to assign an eight-bit character
+     * encoding to mostly ASCII sequences. If a more specific match can not
+     * be made, a string with a CR(LF) in it is most probably windows-1252,
+     * otherwise ISO-8859-1, except if it contains the currency/euro symbol
+     * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
+     */
+    @Test
+    public void testLatinDetectionHeuristics() throws Exception {
+        String windows = "test\r\n";
+        String unix = "test\n";
+        String euro = "test \u20ac\n";
+
+        Metadata metadata;
+
+        metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
+                new DefaultHandler(), metadata, new ParseContext());
+        assertEquals(
+                "text/plain; charset=windows-1252",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
+                new DefaultHandler(), metadata, new ParseContext());
+        assertEquals(
+                "text/plain; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
+                new DefaultHandler(), metadata, new ParseContext());
+        assertEquals(
+                "text/plain; charset=ISO-8859-15",
+                metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Test case for TIKA-240: Drop the BOM when extracting plain text
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
+     */
+    @Test
+    public void testDropByteOrderMark() throws Exception {
+        assertExtractText("UTF-8 BOM", "test", new byte[]{
+                (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
+        assertExtractText("UTF-16 BE BOM", "test", new byte[]{
+                (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
+        assertExtractText("UTF-16 LE BOM", "test", new byte[]{
+                (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
+    }
+
+    /**
+     * Test case for TIKA-335: using incoming charset
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+     */
+    @Test
+    public void testUseIncomingCharsetAsHint() throws Exception {
+        // Could be ISO 8859-1 or ISO 8859-15 or ...
+        // u00e1 is latin small letter a with acute
+        final String test2 = "the name is \u00e1ndre";
+
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+        metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+    }
+
+    /**
+     * Test case for TIKA-341: using charset in content-type
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+     */
+    @Test
+    public void testUsingCharsetInContentTypeHeader() throws Exception {
+        // Could be ISO 8859-1 or ISO 8859-15 or ...
+        // u00e1 is latin small letter a with acute
+        final String test2 = "the name is \u00e1ndre";
+
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+    }
+
+    private void assertExtractText(String msg, String expected, byte[] input)
+            throws Exception {
+        ContentHandler handler = new BodyContentHandler() {
+            public void ignorableWhitespace(char[] ch, int off, int len) {
+                // Ignore the whitespace added by XHTMLContentHandler
+            }
+        };
+        Metadata metadata = new Metadata();
+        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
+        assertEquals(msg, expected, handler.toString());
+    }
+
+    /**
+     * Test case for TIKA-339: don't override incoming language
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+     */
+    @Test
+    public void testRetainIncomingLanguage() throws Exception {
+        final String test = "Simple Content";
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.LANGUAGE, "en");
+
+        parser.parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+    }
+
+    @Test
+    public void testCP866() throws Exception {
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testEBCDIC_CP500() throws Exception {
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt"),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));
+
+        // Additional check that it isn't too eager on short blocks of text
+        metadata = new Metadata();
+        writer = new StringWriter();
+        parser.parse(
+                new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+     */
+    @Test
+    public void testCharsetDetectionWithShortSnipet() throws Exception {
+        final String text = "Hello, World!";
+
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+        // Now verify that if we tell the parser the encoding is UTF-8, that's what
+        // we get back (see TIKA-868)
+        metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+}