You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [28/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,54 @@
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Abstract class for recognizing a single charset.
+ * Part of the implementation of ICU's CharsetDetector.
+ *
+ * Each specific charset that can be recognized will have an instance
+ * of some subclass of this class. All interaction between the overall
+ * CharsetDetector and the stuff specific to an individual charset happens
+ * via the interface provided here.
+ *
+ * Instances of CharsetRecognizer DO NOT have or maintain
+ * state pertaining to a specific match or detect operation.
+ * They WILL be shared by multiple instances of CharsetDetector.
+ * They encapsulate const charset-specific information.
+ *
+ * @internal
+ */
+abstract class CharsetRecognizer {
+
+    /**
+     * Returns the IANA name of the charset handled by this recognizer.
+     *
+     * @return the charset name
+     */
+    abstract String getName();
+
+    /**
+     * Returns the ISO language code associated with this charset.
+     * This base implementation always reports the language as unknown.
+     *
+     * @return the language code, or <code>null</code> when the language
+     *         cannot be determined
+     */
+    public String getLanguage() {
+        return null;
+    }
+
+    /**
+     * Checks how well the input text held by the given detector matches
+     * this charset.
+     *
+     * @param det the CharsetDetector holding the input text that is to be
+     *            tested against this charset
+     * @return two values packed into a single int:
+     *         <br/>
+     *         bits 0-7: the match confidence, ranging from 0-100
+     *         <br/>
+     *         bits 8-15: the match reason, an enum-like value
+     */
+    abstract int match(CharsetDetector det);
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.CharsetUtils;
+
+public class Icu4jEncodingDetector implements EncodingDetector {
+
+    /**
+     * Detects the character encoding of the given stream with the
+     * {@link CharsetDetector} of this package. A charset declared in the
+     * metadata, if any, is handed to the detector as the declared encoding.
+     *
+     * @param input    stream to examine, may be <code>null</code>
+     * @param metadata document metadata consulted for a declared charset
+     * @return the charset of the first reported match whose name can be
+     *         resolved by {@link CharsetUtils#forName(String)}, or
+     *         <code>null</code> if the input is <code>null</code> or no
+     *         match can be resolved
+     * @throws IOException if reading the stream fails
+     */
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        CharsetDetector icu = new CharsetDetector();
+
+        String declared = findDeclaredCharset(metadata);
+        String normalized =
+                (declared == null) ? null : CharsetUtils.clean(declared);
+        if (normalized != null) {
+            icu.setDeclaredEncoding(normalized);
+        }
+        // TODO: log a warning when a declared charset cannot be cleaned?
+
+        // TIKA-341: short HTML tests don't work well unless input
+        // filtering (stripping of tags) is enabled
+        icu.enableInputFilter(true);
+
+        icu.setText(input);
+
+        for (CharsetMatch candidate : icu.detectAll()) {
+            try {
+                return CharsetUtils.forName(candidate.getName());
+            } catch (Exception ignored) {
+                // name could not be resolved to a charset; try the next match
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Looks up the charset declared in the metadata: the content encoding
+     * entry if present, otherwise the <code>charset</code> parameter of the
+     * content type entry (TIKA-341).
+     *
+     * @param metadata document metadata
+     * @return the declared charset name, or <code>null</code> if none is found
+     */
+    private static String findDeclaredCharset(Metadata metadata) {
+        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (encoding != null || contentType == null) {
+            return encoding;
+        }
+        MediaType parsed = MediaType.parse(contentType);
+        if (parsed == null) {
+            return null;
+        }
+        return parsed.getParameters().get("charset");
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Plain text parser. The text encoding of the document stream is
+ * automatically detected based on the byte patterns found at the
+ * beginning of the stream and the given document metadata, most
+ * notably the <code>charset</code> parameter of a
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
+ * <p/>
+ * This parser sets the following output metadata entries:
+ * <dl>
+ * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
+ * <dd><code>text/plain; charset=...</code></dd>
+ * </dl>
+ */
+public class TXTParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -6656102320836888910L;
+
+    /** The single media type handled by this parser. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.TEXT_PLAIN);
+
+    /** Service loader used when the parse context does not supply one. */
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(TXTParser.class.getClassLoader());
+
+    /**
+     * Returns the media types supported by this parser.
+     *
+     * @param context parse context (not consulted)
+     * @return a singleton set holding <code>text/plain</code>
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the stream as plain text: detects the charset, records it in
+     * the metadata and emits the whole text inside one <code>p</code> element.
+     *
+     * @param stream   document stream; it is wrapped in a
+     *                 {@link CloseShieldInputStream} before being read
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata document metadata; content type and content encoding
+     *                 are set from the detected charset
+     * @param context  parse context, queried for a {@link ServiceLoader}
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        InputStream shielded = new CloseShieldInputStream(stream);
+        ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
+
+        // Automatically detect the character encoding
+        try (AutoDetectReader reader =
+                     new AutoDetectReader(shielded, metadata, loader)) {
+            Charset detected = reader.getCharset();
+            MediaType typeWithCharset =
+                    new MediaType(MediaType.TEXT_PLAIN, detected);
+            metadata.set(Metadata.CONTENT_TYPE, typeWithCharset.toString());
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, detected.name());
+
+            XHTMLContentHandler xhtml =
+                    new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            xhtml.startElement("p");
+
+            // Copy the decoded text to the handler chunk by chunk
+            char[] chunk = new char[4096];
+            for (int count = reader.read(chunk);
+                 count != -1;
+                 count = reader.read(chunk)) {
+                xhtml.characters(chunk, 0, count);
+            }
+
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+public class UniversalEncodingDetector implements EncodingDetector {
+
+    /** Size of the read buffer, in bytes. */
+    private static final int BUFSIZE = 1024;
+
+    /** Read limit passed to mark() and upper bound on the bytes examined. */
+    private static final int LOOKAHEAD = 16 * BUFSIZE;
+
+    /**
+     * Detects the character encoding by feeding the beginning of the stream
+     * to a {@link UniversalEncodingListener}. The stream is marked before
+     * reading and reset afterwards.
+     *
+     * @param input    stream to examine, may be <code>null</code>
+     * @param metadata document metadata handed to the listener
+     * @return the charset reported by the listener, or <code>null</code> if
+     *         the input is <code>null</code> or a {@link LinkageError}
+     *         occurs during detection
+     * @throws IOException if reading or resetting the stream fails
+     */
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        input.mark(LOOKAHEAD);
+        try {
+            UniversalEncodingListener listener =
+                    new UniversalEncodingListener(metadata);
+
+            byte[] chunk = new byte[BUFSIZE];
+            int total = 0;
+            // Stop at end of stream, at the lookahead limit, or as soon as
+            // the listener reports that it is done
+            for (int count = input.read(chunk);
+                 count != -1 && total < LOOKAHEAD && !listener.isDone();
+                 count = input.read(
+                         chunk, 0, Math.min(chunk.length, LOOKAHEAD - total))) {
+                total += count;
+                listener.handleData(chunk, 0, count);
+            }
+
+            return listener.dataEnd();
+        } catch (LinkageError e) {
+            return null; // juniversalchardet is not available
+        } finally {
+            input.reset();
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.nio.charset.Charset;
+
+import org.apache.tika.detect.TextStatistics;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.CharsetUtils;
+import org.mozilla.universalchardet.CharsetListener;
+import org.mozilla.universalchardet.Constants;
+import org.mozilla.universalchardet.UniversalDetector;
+
+/**
+ * Helper class used by {@link UniversalEncodingDetector} to access the
+ * <code>juniversalchardet</code> detection logic.
+ */
+class UniversalEncodingListener implements CharsetListener {
+
+ private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";
+
+ private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";
+
+ private final TextStatistics statistics = new TextStatistics();
+
+ private final UniversalDetector detector = new UniversalDetector(this);
+
+ private String hint = null;
+
+ private Charset charset = null;
+
+ public UniversalEncodingListener(Metadata metadata) {
+ MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+ if (type != null) {
+ hint = type.getParameters().get("charset");
+ }
+ if (hint == null) {
+ hint = metadata.get(Metadata.CONTENT_ENCODING);
+ }
+ }
+
+ public void report(String name) {
+ if (Constants.CHARSET_WINDOWS_1252.equals(name)) {
+ if (hint != null) {
+ // Use the encoding hint when available
+ name = hint;
+ } else if (statistics.count('\r') == 0) {
+ // If there are no CR(LF)s, then the encoding is more
+ // likely to be ISO-8859-1(5) than windows-1252
+ if (statistics.count(0xa4) > 0) { // currency/euro sign
+ // The general currency sign is hardly ever used in
+ // ISO-8859-1, so it's more likely that we're dealing
+ // with ISO-8859-15, where the character is used for
+ // the euro symbol, which is more commonly used.
+ name = CHARSET_ISO_8859_15;
+ } else {
+ name = CHARSET_ISO_8859_1;
+ }
+ }
+ }
+ try {
+ this.charset = CharsetUtils.forName(name);
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+
+ public boolean isDone() {
+ return detector.isDone();
+ }
+
+ public void handleData(byte[] buf, int offset, int length) {
+ statistics.addData(buf, offset, length);
+ detector.handleData(buf, offset, length);
+ }
+
+ public Charset dataEnd() {
+ detector.dataEnd();
+ if (charset == null && statistics.isMostlyAscii()) {
+ report(Constants.CHARSET_WINDOWS_1252);
+ }
+ return charset;
+ }
+
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 0.10
+ */
+class AbstractMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+    private final Property property;
+    private final String name;
+
+    /**
+     * Creates a handler that writes to the metadata entry with the given name.
+     *
+     * @param metadata target metadata
+     * @param name     name of the metadata entry
+     */
+    protected AbstractMetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+
+    /**
+     * Creates a handler that writes to the given metadata property; the
+     * entry name is taken from the property.
+     *
+     * @param metadata target metadata
+     * @param property target property
+     */
+    protected AbstractMetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Records the given metadata value. The value is ignored if it is
+     * <code>null</code> or empty, or if the entry already contains it.
+     * Otherwise it is added to a multi-valued entry, set on an entry that
+     * is missing or empty, and for an entry holding one different value
+     * either added or, when the property does not permit multiple values,
+     * set in place of the existing value.
+     *
+     * @param value metadata value
+     */
+    protected void addMetadata(String value) {
+        if (value == null || value.length() == 0) {
+            return;
+        }
+
+        if (metadata.isMultiValued(name)) {
+            // Add the value, assuming it's not already there
+            List<String> existing = Arrays.asList(metadata.getValues(name));
+            if (!existing.contains(value)) {
+                appendToEntry(value);
+            }
+            return;
+        }
+
+        String current = metadata.get(name);
+        if (current == null || current.length() == 0) {
+            assignToEntry(value);
+        } else if (!current.equals(value)) {
+            if (property != null && !property.isMultiValuePermitted()) {
+                // Replace the existing value if isMultiValuePermitted is false
+                metadata.set(property, value);
+            } else {
+                appendToEntry(value);
+            }
+        }
+    }
+
+    /** Adds the value through the property when one is configured, else by name. */
+    private void appendToEntry(String value) {
+        if (property != null) {
+            metadata.add(property, value);
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    /** Sets the value through the property when one is configured, else by name. */
+    private void assignToEntry(String value) {
+        if (property != null) {
+            metadata.set(property, value);
+        } else {
+            metadata.set(name, value);
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds a Metadata entry for a given node.
+ * The textual content of the node is used as the
+ * value, and the Metadata name is taken from
+ * an attribute, with a prefix if required.
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    /** Name of the attribute whose value supplies the metadata name. */
+    private final String nameHoldingAttribute;
+    /** Prefix put in front of the attribute value, or null for none. */
+    private final String namePrefix;
+    /** Metadata name taken from the most recent element carrying the attribute. */
+    private String name;
+
+    /** Character data collected since the last end of an element. */
+    private final StringBuilder buffer = new StringBuilder();
+
+    /**
+     * @param metadata             target metadata
+     * @param nameHoldingAttribute attribute whose value names the metadata entry
+     * @param namePrefix           prefix for the metadata name, may be
+     *                             <code>null</code>
+     */
+    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+        this.metadata = metadata;
+        this.nameHoldingAttribute = nameHoldingAttribute;
+        this.namePrefix = namePrefix;
+    }
+
+    /**
+     * Stores the given value under the current metadata name. Nothing is
+     * stored when no name has been found yet, or when the value is
+     * <code>null</code> or empty. If the entry already has a non-empty
+     * value, the new value is appended to it, separated by ", ".
+     *
+     * @param value metadata value, may be <code>null</code>
+     */
+    public void addMetadata(String value) {
+        if (name == null || name.length() == 0) {
+            // We didn't find the attribute which holds the name
+            return;
+        }
+        // A null value is ignored rather than dereferenced
+        if (value == null || value.length() == 0) {
+            return;
+        }
+        String previous = metadata.get(name);
+        if (previous != null && previous.length() > 0) {
+            value = previous + ", " + value;
+        }
+        metadata.set(name, value);
+    }
+
+    /**
+     * Stores the buffered character data as a metadata value and clears
+     * the buffer. The <code>name</code> parameter (the element's qualified
+     * name) is not used; the metadata name comes from the field set in
+     * {@link #startElement}.
+     */
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+    }
+
+    /**
+     * Updates the current metadata name from the name-holding attribute,
+     * if the element carries it. The previous name is kept otherwise.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        String rawName = attributes.getValue(nameHoldingAttribute);
+        if (rawName != null) {
+            if (namePrefix == null) {
+                this.name = rawName;
+            } else {
+                this.name = namePrefix + rawName;
+            }
+        }
+        // All other attributes are ignored
+    }
+
+    /** Appends the reported characters to the buffer. */
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that maps the contents of an XML attribute into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+    /** Namespace URI of the attribute to capture. */
+    private final String uri;
+
+    /** Local name of the attribute to capture. */
+    private final String localName;
+
+    /**
+     * @param uri       namespace URI of the attribute
+     * @param localName local name of the attribute
+     * @param metadata  target metadata
+     * @param name      name of the metadata entry to fill
+     */
+    public AttributeMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+    }
+
+    /**
+     * @param uri       namespace URI of the attribute
+     * @param localName local name of the attribute
+     * @param metadata  target metadata
+     * @param property  metadata property to fill
+     */
+    public AttributeMetadataHandler(
+            String uri, String localName, Metadata metadata, Property property) {
+        super(metadata, property);
+        this.uri = uri;
+        this.localName = localName;
+    }
+
+    /**
+     * Scans the attributes of the started element and records the trimmed
+     * value of each one whose namespace URI and local name match the
+     * configured ones.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        for (int index = 0; index < attributes.getLength(); index++) {
+            if (!attributes.getURI(index).equals(this.uri)) {
+                continue;
+            }
+            if (!attributes.getLocalName(index).equals(this.localName)) {
+                continue;
+            }
+            addMetadata(attributes.getValue(index).trim());
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Dublin Core metadata parser
+ */
+public class DcXMLParser extends XMLParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 4905318835463880819L;
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata, context),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+
+/**
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class ElementMetadataHandler extends AbstractMetadataHandler {
+ /**
+ * Logger for this class
+ */
+ private static final Log logger = LogFactory
+ .getLog(ElementMetadataHandler.class);
+
+ private static final String LOCAL_NAME_RDF_BAG = "Bag";
+ private static final String LOCAL_NAME_RDF_LI = "li";
+ private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+ private final String uri;
+
+ private final String localName;
+
+ private final Metadata metadata;
+
+ private final String name;
+ private Property targetProperty;
+
+ private final boolean allowDuplicateValues;
+ private final boolean allowEmptyValues;
+
+ /**
+ * The buffer used to capture characters when inside a bag li element.
+ */
+ private final StringBuilder bufferBagged = new StringBuilder();
+
+ /**
+ * The buffer used to capture characters inside standard elements.
+ */
+ private final StringBuilder bufferBagless = new StringBuilder();
+
+ /**
+ * Whether or not the value was found in a standard element structure or inside a bag.
+ */
+ private boolean isBagless = true;
+
+ private int matchLevel = 0;
+ private int parentMatchLevel = 0;
+
+ /**
+ * Constructor for string metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created simple handler for " + this.name);
+ }
+ }
+
+ /**
+ * Constructor for string metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ * @param allowDuplicateValues add duplicate values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created simple handler for " + this.name);
+ }
+ }
+
+ /**
+ * Constructor for Property metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created property handler for " + this.name);
+ }
+ }
+
+ /**
+ * Constructor for Property metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ * @param allowDuplicateValues add duplicate values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created property handler for " + this.name);
+ }
+ }
+
+ protected boolean isMatchingParentElement(String uri, String localName) {
+ return (uri.equals(this.uri) && localName.equals(this.localName));
+ }
+
+ protected boolean isMatchingElement(String uri, String localName) {
+ // match if we're inside the parent element or within some bag element
+ return (uri.equals(this.uri) && localName.equals(this.localName)) ||
+ (parentMatchLevel > 0 &&
+ ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
+ (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
+ )
+ );
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ if (isMatchingElement(uri, localName)) {
+ matchLevel++;
+ }
+ if (isMatchingParentElement(uri, localName)) {
+ parentMatchLevel++;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name) {
+ if (isMatchingParentElement(uri, localName)) {
+ parentMatchLevel--;
+ }
+ if (isMatchingElement(uri, localName)) {
+ matchLevel--;
+ if (matchLevel == 2) {
+ // we're inside a bag li element, add the bagged buffer
+ addMetadata(bufferBagged.toString().trim());
+ bufferBagged.setLength(0);
+ isBagless = false;
+ }
+ if (matchLevel == 0 && isBagless) {
+ String valueBagless = bufferBagless.toString();
+ if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
+ // we're in a standard element, add the bagless buffer
+ addMetadata(valueBagless.trim());
+ bufferBagless.setLength(0);
+ }
+ isBagless = true;
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ // We need to append to both buffers since we don't if we're inside a bag until we're done
+ if (parentMatchLevel > 0 && matchLevel > 2) {
+ bufferBagged.append(ch, start, length);
+ }
+ if (parentMatchLevel > 0 && matchLevel > 0) {
+ bufferBagless.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) {
+ characters(ch, start, length);
+ }
+
+ @Override
+ protected void addMetadata(String value) {
+ if (logger.isTraceEnabled()) {
+ logger.trace("adding " + name + "=" + value);
+ }
+ if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
+ if ((value != null && value.length() > 0) || allowEmptyValues) {
+ if (value == null || value.length() == 0 && allowEmptyValues) {
+ value = "";
+ }
+ String[] previous = metadata.getValues(name);
+ if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
+ metadata.add(targetProperty, value);
+ }
+ }
+ } else {
+ super.addMetadata(value);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+public class FictionBookParser extends XMLParser {
+ private static final long serialVersionUID = 4195954546491524374L;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.application("x-fictionbook+xml"));
+ }
+
+ @Override
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+ if (ex == null) {
+ ex = new ParsingEmbeddedDocumentExtractor(context);
+ }
+
+ return new BinaryElementsDataHandler(ex, handler);
+ }
+
+ private static class BinaryElementsDataHandler extends DefaultHandler {
+ private static final String ELEMENT_BINARY = "binary";
+
+ private boolean binaryMode = false;
+ private static final String ATTRIBUTE_ID = "id";
+
+ private final EmbeddedDocumentExtractor partExtractor;
+ private final ContentHandler handler;
+ private final StringBuilder binaryData = new StringBuilder();
+ private Metadata metadata;
+ private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
+
+ private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
+ this.partExtractor = partExtractor;
+ this.handler = handler;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ binaryMode = ELEMENT_BINARY.equals(localName);
+ if (binaryMode) {
+ binaryData.setLength(0);
+ metadata = new Metadata();
+
+ metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
+ metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (binaryMode) {
+ try {
+ partExtractor.parseEmbedded(
+ new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
+ handler,
+ metadata,
+ true
+ );
+ } catch (IOException e) {
+ throw new SAXException("IOException in parseEmbedded", e);
+ }
+
+ binaryMode = false;
+ binaryData.setLength(0);
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (!binaryMode) {
+ handler.characters(ch, start, length);
+ } else {
+ binaryData.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ handler.ignorableWhitespace(ch, start, length);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds Metadata entries with a specified name for
+ * the textual content of a node (if present), and
+ * all attribute values passed through the matcher
+ * (but not their names).
+ *
+ * Successive values are joined into one string separated by ", ".
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ * {@link ElementMetadataHandler} classes instead
+ */
+public class MetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    /** Target property, or null when the handler was created with a string key. */
+    private final Property property;
+
+    /** Metadata key; for Property-based handlers this is the property name. */
+    private final String name;
+
+    /** Character data collected since the last endElement call. */
+    private final StringBuilder buffer = new StringBuilder();
+
+    /**
+     * Creates a handler that writes to a string metadata key.
+     *
+     * @param metadata the metadata object to populate
+     * @param name     the metadata key
+     */
+    public MetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+
+    /**
+     * Creates a handler that writes to a metadata property.
+     *
+     * @param metadata the metadata object to populate
+     * @param property the metadata property
+     */
+    public MetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Appends a value to the metadata entry. Null and empty values are
+     * ignored; a non-empty existing value is kept and the new one is
+     * appended after ", ".
+     *
+     * @param value the value to add, may be null
+     */
+    public void addMetadata(String value) {
+        // Ignore null as well as empty input instead of failing with a
+        // NullPointerException, in line with ElementMetadataHandler.
+        if (value == null || value.length() == 0) {
+            return;
+        }
+
+        String previous = metadata.get(name);
+        if (previous != null && previous.length() > 0) {
+            value = previous + ", " + value;
+        }
+
+        if (this.property != null) {
+            metadata.set(property, value);
+        } else {
+            metadata.set(name, value);
+        }
+    }
+
+    /** Adds the text collected so far and clears the buffer. */
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+    }
+
+    /** Adds the value of every attribute of the element. */
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            addMetadata(attributes.getValue(i));
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * XML parser. Handles {@code application/xml} and {@code image/svg+xml};
+ * subclasses can replace the content handler through
+ * {@link #getContentHandler(ContentHandler, Metadata, ParseContext)}.
+ */
+public class XMLParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6028836725280212837L;
+
+    /** Media types handled by this parser (unmodifiable). */
+    private static final Set<MediaType> SUPPORTED_TYPES;
+
+    static {
+        Set<MediaType> types = new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("xml"),
+                MediaType.image("svg+xml")));
+        SUPPORTED_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the stream with the SAX parser of the given context. The XHTML
+     * output is a document containing a single {@code p} element, which is
+     * closed even when parsing fails.
+     *
+     * @throws TikaException if the SAX parser reports an error that
+     *                       {@code tagged.throwIfCauseOf} does not rethrow
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Only fill in a content type when the caller did not provide one.
+        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
+        if (declaredType == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
+        }
+
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+
+        // Note that the tagged handler wraps the caller's handler, not xhtml.
+        TaggedContentHandler tagged = new TaggedContentHandler(handler);
+        try {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(
+                            new EmbeddedContentHandler(
+                                    getContentHandler(tagged, metadata, context))));
+        } catch (SAXException e) {
+            // presumably rethrows e when it came from the tagged handler - confirm
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Creates the handler that receives the SAX events of the parsed
+     * document. This implementation returns a {@link TextContentHandler}
+     * around the given handler.
+     *
+     * @param handler  handler to wrap
+     * @param metadata document metadata (unused here)
+     * @param context  parse context (unused here)
+     * @return the content handler to parse with
+     */
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        return new TextContentHandler(handler, true);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector Wed Jan 6 03:50:50 2016
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.txt.UniversalEncodingDetector
+org.apache.tika.parser.txt.Icu4jEncodingDetector
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.txt.TXTParser
+#org.apache.tika.parser.strings.Latin1StringsParser
+#org.apache.tika.parser.strings.StringsParser
+org.apache.tika.parser.xml.DcXMLParser
+org.apache.tika.parser.xml.FictionBookParser
+#org.apache.tika.parser.xml.XMLParser
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/FileConfigTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,28 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+public class FileConfigTest {
+
+    /**
+     * A FileConfig created with the no-argument constructor must report an
+     * empty file path and the mimetype option switched off.
+     */
+    @Test
+    public void testNoConfig() {
+        final FileConfig defaults = new FileConfig();
+
+        final String expectedPath = "";
+        assertEquals("Invalid default filePath value", expectedPath, defaults.getFilePath());
+
+        final boolean expectedMimetype = false;
+        assertEquals("Invalid default mime option value", expectedMimetype, defaults.isMimetype());
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/Latin1StringsParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,69 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_16;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class Latin1StringsParserTest {
+
+    /**
+     * Feeds the parser a byte stream made of the test string in ISO-8859-1,
+     * UTF-8 and UTF-16, separated by zero bytes and other non-text bytes and
+     * followed by a two-character string, and expects the output to be the
+     * test string three times, each followed by a newline.
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        String testStr = "These are Latin1 accented scripts: \u00C2 \u00C3 \u00C9 \u00DC \u00E2 \u00E3 \u00E9 \u00FC";
+        String smallStr = "ab";
+
+        byte[] iso8859Bytes = testStr.getBytes(ISO_8859_1);
+        byte[] utf8Bytes = testStr.getBytes(UTF_8);
+        byte[] utf16Bytes = testStr.getBytes(UTF_16);
+        byte[] zeros = new byte[10];
+        byte[] smallString = smallStr.getBytes(ISO_8859_1);
+        byte[] trashBytes = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        baos.write(iso8859Bytes);
+        baos.write(zeros);
+        baos.write(utf8Bytes);
+        baos.write(trashBytes);
+        baos.write(utf16Bytes);
+        baos.write(zeros);
+        baos.write(smallString);
+
+        Parser parser = new Latin1StringsParser();
+        ContentHandler handler = new BodyContentHandler();
+
+        try (InputStream stream = new ByteArrayInputStream(baos.toByteArray())) {
+            parser.parse(stream, handler, new Metadata(), new ParseContext());
+        }
+
+        String result = handler.toString();
+        String expected = testStr + "\n" + testStr + "\n" + testStr + "\n";
+
+        // Test if result contains only the test string appended 3 times.
+        // assertEquals reports both strings on failure, unlike
+        // assertTrue(result.equals(expected)).
+        assertEquals("Unexpected parser output", expected, result);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,61 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.InputStream;
+
+import org.junit.Test;
+
+public class StringsConfigTest {
+
+    /** Checks the values of a StringsConfig created without any properties. */
+    @Test
+    public void testNoConfig() {
+        StringsConfig config = new StringsConfig();
+        assertEquals("Invalid default filePath value", "", config.getStringsPath());
+        assertEquals("Invalid default encoding value", StringsEncoding.SINGLE_7_BIT, config.getEncoding());
+        assertEquals("Invalid default min-len value", 4, config.getMinLength());
+        assertEquals("Invalid default timeout value", 120, config.getTimeout());
+    }
+
+    /**
+     * Checks a configuration loaded from the partial properties resource:
+     * encoding and timeout are expected to be overridden, while the strings
+     * path and minimum length are expected to keep their defaults.
+     */
+    @Test
+    public void testPartialConfig() throws Exception {
+        // try-with-resources so the resource stream is always closed
+        try (InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-partial.properties")) {
+            StringsConfig config = new StringsConfig(stream);
+            assertEquals("Invalid default stringsPath value", "", config.getStringsPath());
+            assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+            assertEquals("Invalid default min-len value", 4, config.getMinLength());
+            assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+        }
+    }
+
+    /**
+     * Checks a configuration loaded from the full properties resource, in
+     * which every value is expected to be overridden.
+     */
+    @Test
+    public void testFullConfig() throws Exception {
+        // try-with-resources so the resource stream is always closed
+        try (InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-full.properties")) {
+            StringsConfig config = new StringsConfig(stream);
+            assertEquals("Invalid overridden stringsPath value", "/opt/strings" + File.separator, config.getStringsPath());
+            assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+            assertEquals("Invalid overridden min-len value", 3, config.getMinLength());
+            assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+        }
+    }
+
+    /**
+     * Despite its name, this test covers validation of the minimum length:
+     * setting it to 0 is expected to raise an IllegalArgumentException.
+     */
+    @Test(expected=IllegalArgumentException.class)
+    public void testValidateEconding() {
+        StringsConfig config = new StringsConfig();
+        config.setMinLength(0);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,74 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import static org.apache.tika.parser.strings.StringsParser.getStringsProg;
+import static org.junit.Assert.*;
+import static org.junit.Assume.assumeTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+ /**
+  * Tests for {@link StringsParser}. The parse test is skipped through a JUnit
+  * assumption when {@link #canRun()} reports that the external strings
+  * program cannot be executed.
+  */
+ public class StringsParserTest {
+     /**
+      * Reports whether the external strings program can be run, by passing
+      * the configured strings path plus program name, followed by
+      * {@code --version}, to {@code ExternalParser.check}.
+      *
+      * @return the result of the external check
+      */
+     public static boolean canRun() {
+         StringsConfig config = new StringsConfig();
+         String[] checkCmd = {config.getStringsPath() + getStringsProg(), "--version"};
+         return ExternalParser.check(checkCmd);
+     }
+
+     /**
+      * Parses a dBase III test document with default {@link StringsConfig} and
+      * {@link FileConfig} and checks that the expected field names appear in
+      * the extracted text.
+      *
+      * @throws Exception if opening the resource or parsing fails; failures are
+      *         propagated so the test fails with the real cause instead of a
+      *         secondary assertion error
+      */
+     @Test
+     public void testParse() throws Exception {
+         assumeTrue(canRun());
+
+         String resource = "/test-documents/testOCTET_header.dbase3";
+
+         String[] content = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", "LISTDATE" };
+
+         String[] met_attributes = {"min-len", "encoding", "strings:file_output"};
+
+         StringsConfig stringsConfig = new StringsConfig();
+         FileConfig fileConfig = new FileConfig();
+
+         Parser parser = new StringsParser();
+         ContentHandler handler = new BodyContentHandler();
+         Metadata metadata = new Metadata();
+
+         ParseContext context = new ParseContext();
+         context.set(StringsConfig.class, stringsConfig);
+         context.set(FileConfig.class, fileConfig);
+
+         // No catch block: a parser exception must fail the test directly
+         // rather than being printed and swallowed.
+         try (InputStream stream = StringsParserTest.class.getResourceAsStream(resource)) {
+             parser.parse(stream, handler, metadata, context);
+         }
+
+         // Content
+         String extracted = handler.toString();
+         for (String word : content) {
+             assertTrue("Extracted text does not contain: " + word, extracted.contains(word));
+         }
+
+         // Metadata
+         // NOTE(review): the result of this comparison is discarded, so it does
+         // not assert anything. Turning it into an assertion requires confirming
+         // the exact names (and order) the parser writes to the metadata.
+         Arrays.equals(met_attributes, metadata.names());
+     }
+ }
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.junit.Test;
+
+ /**
+  * Tests for {@link CharsetDetector} run against the resume.html test document.
+  */
+ public class CharsetDetectorTest {
+
+     /**
+      * Enables the detector's input filter, runs detection on the HTML test
+      * document and expects the highest-confidence match to be UTF-8.
+      */
+     @Test
+     public void testTagDropper() throws IOException {
+         try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+             CharsetDetector detector = new CharsetDetector();
+             detector.enableInputFilter(true);
+             detector.setText(in);
+
+             // Pick the first match with the strictly highest confidence.
+             CharsetMatch[] candidates = detector.detectAll();
+             CharsetMatch best = null;
+             for (int i = 0; i < candidates.length; i++) {
+                 CharsetMatch candidate = candidates[i];
+                 if (best == null) {
+                     best = candidate;
+                 } else if (best.getConfidence() < candidate.getConfidence()) {
+                     best = candidate;
+                 }
+             }
+
+             assertTrue(best != null);
+             assertEquals("UTF-8", best.getName());
+         }
+     }
+
+     /**
+      * Verifies that a null or empty declared encoding does not cause an
+      * exception when requesting a reader, and that the reader is ready.
+      *
+      * @see <a href="https://issues.apache.org/jira/browse/TIKA-1248">TIKA-1248</a>
+      */
+     @Test
+     public void testEmptyOrNullDeclaredCharset() throws IOException {
+         try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+             CharsetDetector detector = new CharsetDetector();
+
+             Reader fromNullEncoding = detector.getReader(in, null);
+             assertTrue(fromNullEncoding.ready());
+
+             Reader fromEmptyEncoding = detector.getReader(in, "");
+             assertTrue(fromEmptyEncoding.ready());
+         }
+     }
+ }
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+ import java.io.ByteArrayInputStream;
+ import java.io.InputStream;
+ import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+ /**
+  * Tests for {@link TXTParser}: charset detection reported through the
+  * content type metadata, charset hints taken from incoming metadata, and
+  * plain-text extraction.
+  */
+ public class TXTParserTest {
+
+     /** Parser under test; one instance is used by every test method. */
+     private final Parser parser = new TXTParser();
+
+     /**
+      * Parses an English sentence made of ASCII characters only (encoded here
+      * with ISO_8859_1) and expects an ISO-8859-1 content type, no language
+      * metadata, and the sample words in the written output.
+      */
+     @Test
+     public void testEnglishText() throws Exception {
+         String text =
+                 "Hello, World! This is simple UTF-8 text content written"
+                         + " in English to test autodetection of both the character"
+                         + " encoding and the language of the input stream.";
+
+         Metadata metadata = new Metadata();
+         StringWriter writer = new StringWriter();
+         parser.parse(
+                 new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
+                 new WriteOutContentHandler(writer),
+                 metadata,
+                 new ParseContext());
+         String content = writer.toString();
+
+         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+         // TIKA-501: Remove language detection from TXTParser
+         assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
+         assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
+
+         assertContains("Hello", content);
+         assertContains("World", content);
+         assertContains("autodetection", content);
+         assertContains("stream", content);
+     }
+
+     /**
+      * Parses UTF-8 encoded text containing non-ASCII letters and expects
+      * UTF-8 in both the content type and the (deprecated) content encoding,
+      * with the original text present in the extracted output.
+      */
+     @Test
+     public void testUTF8Text() throws Exception {
+         String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+         ContentHandler handler = new BodyContentHandler();
+         Metadata metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(text.getBytes(UTF_8)),
+                 handler, metadata, new ParseContext());
+         assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+         assertContains(text, handler.toString());
+     }
+
+     /**
+      * Parses zero bytes of input and expects a UTF-8 content type and a
+      * single newline as the extracted text.
+      */
+     @Test
+     public void testEmptyText() throws Exception {
+         ContentHandler handler = new BodyContentHandler();
+         Metadata metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
+         assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+         assertEquals("\n", handler.toString());
+     }
+
+     /**
+      * Test for the heuristics that we use to assign an eight-bit character
+      * encoding to mostly ASCII sequences. If a more specific match can not
+      * be made, a string with a CR(LF) in it is most probably windows-1252,
+      * otherwise ISO-8859-1, except if it contains the currency/euro symbol
+      * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
+      */
+     @Test
+     public void testLatinDetectionHeuristics() throws Exception {
+         String windows = "test\r\n";
+         String unix = "test\n";
+         String euro = "test \u20ac\n";
+
+         Metadata metadata;
+
+         // CRLF line ending: expected to be reported as windows-1252
+         metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
+                 new DefaultHandler(), metadata, new ParseContext());
+         assertEquals(
+                 "text/plain; charset=windows-1252",
+                 metadata.get(Metadata.CONTENT_TYPE));
+
+         // LF line ending only: expected to be reported as ISO-8859-1
+         metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
+                 new DefaultHandler(), metadata, new ParseContext());
+         assertEquals(
+                 "text/plain; charset=ISO-8859-1",
+                 metadata.get(Metadata.CONTENT_TYPE));
+
+         // Euro sign present: expected to be reported as ISO-8859-15
+         metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
+                 new DefaultHandler(), metadata, new ParseContext());
+         assertEquals(
+                 "text/plain; charset=ISO-8859-15",
+                 metadata.get(Metadata.CONTENT_TYPE));
+     }
+
+     /**
+      * Test case for TIKA-240: Drop the BOM when extracting plain text
+      *
+      * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
+      */
+     @Test
+     public void testDropByteOrderMark() throws Exception {
+         assertExtractText("UTF-8 BOM", "test", new byte[]{
+                 (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
+         assertExtractText("UTF-16 BE BOM", "test", new byte[]{
+                 (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
+         assertExtractText("UTF-16 LE BOM", "test", new byte[]{
+                 (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
+     }
+
+     /**
+      * Test case for TIKA-335: using incoming charset
+      *
+      * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+      */
+     @Test
+     public void testUseIncomingCharsetAsHint() throws Exception {
+         // Could be ISO 8859-1 or ISO 8859-15 or ...
+         // u00e1 is latin small letter a with acute
+         final String test2 = "the name is \u00e1ndre";
+
+         Metadata metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+         // Same Metadata instance is reused, now carrying a charset hint
+         metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
+         parser.parse(
+                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+         assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+     }
+
+     /**
+      * Test case for TIKA-341: using charset in content-type
+      *
+      * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+      */
+     @Test
+     public void testUsingCharsetInContentTypeHeader() throws Exception {
+         // Could be ISO 8859-1 or ISO 8859-15 or ...
+         // u00e1 is latin small letter a with acute
+         final String test2 = "the name is \u00e1ndre";
+
+         Metadata metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+         // Fresh Metadata with a text/html content type that carries a charset;
+         // the parser is expected to report text/plain with that charset.
+         metadata = new Metadata();
+         metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
+         parser.parse(
+                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+         assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+     }
+
+     /**
+      * Parses the given bytes and asserts that the extracted body text equals
+      * the expected string exactly. Ignorable whitespace events are dropped so
+      * they do not end up in the compared text.
+      *
+      * @param msg      assertion message used on mismatch
+      * @param expected expected extracted text
+      * @param input    raw bytes handed to the parser
+      * @throws Exception if parsing fails
+      */
+     private void assertExtractText(String msg, String expected, byte[] input)
+             throws Exception {
+         ContentHandler handler = new BodyContentHandler() {
+             @Override
+             public void ignorableWhitespace(char[] ch, int off, int len) {
+                 // Ignore the whitespace added by XHTMLContentHandler
+             }
+         };
+         Metadata metadata = new Metadata();
+         parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
+         assertEquals(msg, expected, handler.toString());
+     }
+
+     /**
+      * Test case for TIKA-339: don't override incoming language
+      *
+      * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+      */
+     @Test
+     public void testRetainIncomingLanguage() throws Exception {
+         final String test = "Simple Content";
+
+         Metadata metadata = new Metadata();
+         metadata.set(TikaCoreProperties.LANGUAGE, "en");
+
+         parser.parse(
+                 new ByteArrayInputStream(test.getBytes(UTF_8)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+
+         assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+     }
+
+     /**
+      * Parses the Russian CP866 test document and expects the IBM866 charset
+      * in the content type.
+      */
+     @Test
+     public void testCP866() throws Exception {
+         Metadata metadata = new Metadata();
+         StringWriter writer = new StringWriter();
+         // try-with-resources so the classpath stream is closed after parsing
+         try (InputStream stream =
+                 TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt")) {
+             parser.parse(
+                     stream,
+                     new WriteOutContentHandler(writer),
+                     metadata,
+                     new ParseContext());
+         }
+
+         assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
+     }
+
+     /**
+      * Parses the English CP500 test document and expects the IBM500 charset,
+      * then checks that a short HTML-like ASCII snippet is still reported as
+      * ISO-8859-1.
+      */
+     @Test
+     public void testEBCDIC_CP500() throws Exception {
+         Metadata metadata = new Metadata();
+         StringWriter writer = new StringWriter();
+         // try-with-resources so the classpath stream is closed after parsing
+         try (InputStream stream =
+                 TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt")) {
+             parser.parse(
+                     stream,
+                     new WriteOutContentHandler(writer),
+                     metadata,
+                     new ParseContext());
+         }
+
+         assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));
+
+         // Additional check that it isn't too eager on short blocks of text
+         metadata = new Metadata();
+         writer = new StringWriter();
+         parser.parse(
+                 new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
+                 new WriteOutContentHandler(writer),
+                 metadata,
+                 new ParseContext());
+
+         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+     }
+
+     /**
+      * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+      *
+      * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+      */
+     @Test
+     public void testCharsetDetectionWithShortSnipet() throws Exception {
+         final String text = "Hello, World!";
+
+         Metadata metadata = new Metadata();
+         parser.parse(
+                 new ByteArrayInputStream(text.getBytes(UTF_8)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+         // Now verify that if we tell the parser the encoding is UTF-8, that's what
+         // we get back (see TIKA-868)
+         metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+         parser.parse(
+                 new ByteArrayInputStream(text.getBytes(UTF_8)),
+                 new BodyContentHandler(), metadata, new ParseContext());
+         assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+     }
+
+ }