You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/09/02 16:46:48 UTC
svn commit: r991956 [2/6] - in /tika/trunk: src/site/apt/ tika-core/src/main/java/org/apache/tika/ tika-core/src/main/java/org/apache/tika/detect/ tika-core/src/main/java/org/apache/tika/metadata/ tika-core/src/main/java/org/apache/tika/parser/ tika-co...

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Thu Sep  2 14:46:46 2010
@@ -1,382 +1,382 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika;
-
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.net.URL;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParsingReader;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Facade class for accessing Tika functionality. This class hides much of
- * the underlying complexity of the lower level Tika classes and provides
- * simple methods for many common parsing and type detection operations.
- *
- * @since Apache Tika 0.5
- * @see Parser
- * @see Detector
- */
-public class Tika {
-
-    /**
-     * The detector instance used by this facade.
-     */
-    private final Detector detector;
-
-    /**
-     * The parser instance used by this facade.
-     */
-    private final Parser parser;
-
-    /**
-     * Maximum length of the strings returned by the parseToString methods.
-     * Used to prevent out of memory problems with huge input documents.
-     * The default setting is 100k characters.
-     */
-    private int maxStringLength = 100 * 1000;
-
-    /**
-     * Creates a Tika facade using the given detector and parser instances.
-     *
-     * @since Apache Tika 0.8
-     * @param detector type detector
-     * @param parser document parser
-     */
-    public Tika(Detector detector, Parser parser) {
-        this.detector = detector;
-        this.parser = parser;
-    }
-
-    /**
-     * Creates a Tika facade using the given configuration.
-     *
-     * @param config Tika configuration
-     */
-    public Tika(TikaConfig config) {
-        this(config.getMimeRepository(), new AutoDetectParser(config));
-    }
-
-    /**
-     * Creates a Tika facade using the default configuration.
-     */
-    public Tika() {
-        this(TikaConfig.getDefaultConfig());
-    }
-
-    /**
-     * Creates a Tika facade using the given detector instance and the
-     * default parser configuration.
-     *
-     * @since Apache Tika 0.8
-     * @param detector type detector
-     */
-    public Tika(Detector detector) {
-        this(detector, new AutoDetectParser(detector));
-    }
-
-    
-    /**
-     * Detects the media type of the given document. The type detection is
-     * based on the content of the given document stream and any given
-     * document metadata. The document stream can be <code>null</code>,
-     * in which case only the given document metadata is used for type
-     * detection.
-     * <p>
-     * If the document stream supports the
-     * {@link InputStream#markSupported() mark feature}, then the stream is
-     * marked and reset to the original position before this method returns.
-     * Only a limited number of bytes are read from the stream.
-     * <p>
-     * The given document stream is <em>not</em> closed by this method.
-     * <p>
-     * Unlike in the {@link #parse(InputStream, Metadata)} method, the
-     * given document metadata is <em>not</em> modified by this method.
-     *
-     * @param stream the document stream, or <code>null</code>
-     * @param metadata document metadata
-     * @return detected media type
-     * @throws IOException if the stream can not be read
-     */
-    public String detect(InputStream stream, Metadata metadata)
-            throws IOException {
-        if (stream == null || stream.markSupported()) {
-            return detector.detect(stream, metadata).toString();
-        } else {
-            return detector.detect(
-                    new BufferedInputStream(stream), metadata).toString();
-        }
-    }
-
-    /**
-     * Detects the media type of the given document. The type detection is
-     * based on the content of the given document stream.
-     * <p>
-     * If the document stream supports the
-     * {@link InputStream#markSupported() mark feature}, then the stream is
-     * marked and reset to the original position before this method returns.
-     * Only a limited number of bytes are read from the stream.
-     * <p>
-     * The given document stream is <em>not</em> closed by this method.
-     *
-     * @param stream the document stream
-     * @return detected media type
-     * @throws IOException if the stream can not be read
-     */
-    public String detect(InputStream stream) throws IOException {
-        return detect(stream, new Metadata());
-    }
-
-    /**
-     * Detects the media type of the given file. The type detection is
-     * based on the document content and a potential known file extension.
-     * <p>
-     * Use the {@link #detect(String)} method when you want to detect the
-     * type of the document without actually accessing the file.
-     *
-     * @param file the file
-     * @return detected media type
-     * @throws IOException if the file can not be read
-     */
-    public String detect(File file) throws IOException {
-        return detect(file.toURI().toURL());
-    }
-
-    /**
-     * Detects the media type of the resource at the given URL. The type
-     * detection is based on the document content and a potential known
-     * file extension included in the URL.
-     * <p>
-     * Use the {@link #detect(String)} method when you want to detect the
-     * type of the document without actually accessing the URL.
-     *
-     * @param url the URL of the resource
-     * @return detected media type
-     * @throws IOException if the resource can not be read
-     */
-    public String detect(URL url) throws IOException {
-        Metadata metadata = new Metadata();
-        InputStream stream = TikaInputStream.get(url, metadata);
-        try {
-            return detect(stream, metadata);
-        } finally {
-            stream.close();
-        }
-    }
-
-    /**
-     * Detects the media type of a document with the given file name.
-     * The type detection is based on known file name extensions.
-     * <p>
-     * The given name can also be a URL or a full file path. In such cases
-     * only the file name part of the string is used for type detection. 
-     *
-     * @param name the file name of the document
-     * @return detected media type
-     */
-    public String detect(String name) {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
-        try {
-            return detect(null, metadata);
-        } catch (IOException e) {
-            throw new IllegalStateException("Unexpected IOException", e);
-        }
-    }
-
-    /**
-     * Parses the given document and returns the extracted text content.
-     * Input metadata like a file name or a content type hint can be passed
-     * in the given metadata instance. Metadata information extracted from
-     * the document is returned in that same metadata instance.
-     *
-     * @param stream the document to be parsed
-     * @return extracted text content
-     * @throws IOException if the document can not be read or parsed
-     */
-    public Reader parse(InputStream stream, Metadata metadata)
-            throws IOException {
-        ParseContext context = new ParseContext();
-        context.set(Parser.class, parser);
-        return new ParsingReader(parser, stream, metadata, context);
-    }
-
-    /**
-     * Parses the given document and returns the extracted text content.
-     *
-     * @param stream the document to be parsed
-     * @return extracted text content
-     * @throws IOException if the document can not be read or parsed
-     */
-    public Reader parse(InputStream stream) throws IOException {
-        return parse(stream, new Metadata());
-    }
-
-    /**
-     * Parses the given file and returns the extracted text content.
-     *
-     * @param file the file to be parsed
-     * @return extracted text content
-     * @throws IOException if the file can not be read or parsed
-     */
-    public Reader parse(File file) throws IOException {
-        return parse(file.toURI().toURL());
-    }
-
-    /**
-     * Parses the resource at the given URL and returns the extracted
-     * text content.
-     *
-     * @param url the URL of the resource to be parsed
-     * @return extracted text content
-     * @throws IOException if the resource can not be read or parsed
-     */
-    public Reader parse(URL url) throws IOException {
-        Metadata metadata = new Metadata();
-        InputStream stream = TikaInputStream.get(url, metadata);
-        return parse(stream, metadata);
-    }
-
-    /**
-     * Parses the given document and returns the extracted text content.
-     * The given input stream is closed by this method.
-     * <p>
-     * To avoid unpredictable excess memory use, the returned string contains
-     * only up to {@link #getMaxStringLength()} first characters extracted
-     * from the input document. Use the {@link #setMaxStringLength(int)}
-     * method to adjust this limitation.
-     *
-     * @param stream the document to be parsed
-     * @param metadata document metadata
-     * @return extracted text content
-     * @throws IOException if the document can not be read
-     * @throws TikaException if the document can not be parsed
-     */
-    public String parseToString(InputStream stream, Metadata metadata)
-            throws IOException, TikaException {
-        WriteOutContentHandler handler =
-            new WriteOutContentHandler(maxStringLength);
-        try {
-            ParseContext context = new ParseContext();
-            context.set(Parser.class, parser);
-            parser.parse(
-                    stream, new BodyContentHandler(handler), metadata, context);
-        } catch (SAXException e) {
-            if (!handler.isWriteLimitReached(e)) {
-                // This should never happen with BodyContentHandler...
-                throw new TikaException("Unexpected SAX processing failure", e);
-            }
-        } finally {
-            stream.close();
-        }
-        return handler.toString();
-    }
-
-    /**
-     * Parses the given document and returns the extracted text content.
-     * The given input stream is closed by this method.
-     * <p>
-     * To avoid unpredictable excess memory use, the returned string contains
-     * only up to {@link #getMaxStringLength()} first characters extracted
-     * from the input document. Use the {@link #setMaxStringLength(int)}
-     * method to adjust this limitation.
-     *
-     * @param stream the document to be parsed
-     * @return extracted text content
-     * @throws IOException if the document can not be read
-     * @throws TikaException if the document can not be parsed
-     */
-    public String parseToString(InputStream stream)
-            throws IOException, TikaException {
-        return parseToString(stream, new Metadata());
-    }
-
-    /**
-     * Parses the given file and returns the extracted text content.
-     * <p>
-     * To avoid unpredictable excess memory use, the returned string contains
-     * only up to {@link #getMaxStringLength()} first characters extracted
-     * from the input document. Use the {@link #setMaxStringLength(int)}
-     * method to adjust this limitation.
-     *
-     * @param file the file to be parsed
-     * @return extracted text content
-     * @throws IOException if the file can not be read
-     * @throws TikaException if the file can not be parsed
-     */
-    public String parseToString(File file) throws IOException, TikaException {
-        return parseToString(file.toURI().toURL());
-    }
-
-    /**
-     * Parses the resource at the given URL and returns the extracted
-     * text content.
-     * <p>
-     * To avoid unpredictable excess memory use, the returned string contains
-     * only up to {@link #getMaxStringLength()} first characters extracted
-     * from the input document. Use the {@link #setMaxStringLength(int)}
-     * method to adjust this limitation.
-     *
-     * @param url the URL of the resource to be parsed
-     * @return extracted text content
-     * @throws IOException if the resource can not be read
-     * @throws TikaException if the resource can not be parsed
-     */
-    public String parseToString(URL url) throws IOException, TikaException {
-        Metadata metadata = new Metadata();
-        InputStream stream = TikaInputStream.get(url, metadata);
-        return parseToString(stream, metadata);
-    }
-
-    /**
-     * Returns the maximum length of strings returned by the
-     * parseToString methods.
-     *
-     * @since Apache Tika 0.7
-     * @return maximum string length, or -1 if the limit has been disabled
-     */
-    public int getMaxStringLength() {
-        return maxStringLength;
-    }
-
-    /**
-     * Sets the maximum length of strings returned by the parseToString
-     * methods.
-     *
-     * @since Apache Tika 0.7
-     * @param maxStringLength maximum string length,
-     *                        or -1 to disable this limit
-     */
-    public void setMaxStringLength(int maxStringLength) {
-        this.maxStringLength = maxStringLength;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.net.URL;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Facade class for accessing Tika functionality. This class hides much of
+ * the underlying complexity of the lower level Tika classes and provides
+ * simple methods for many common parsing and type detection operations.
+ *
+ * @since Apache Tika 0.5
+ * @see Parser
+ * @see Detector
+ */
+public class Tika {
+
+    /**
+     * The detector instance used by this facade.
+     */
+    private final Detector detector;
+
+    /**
+     * The parser instance used by this facade.
+     */
+    private final Parser parser;
+
+    /**
+     * Maximum length of the strings returned by the parseToString methods.
+     * Used to prevent out of memory problems with huge input documents.
+     * The default setting is 100k characters.
+     */
+    private int maxStringLength = 100 * 1000;
+
+    /**
+     * Creates a Tika facade using the given detector and parser instances.
+     *
+     * @since Apache Tika 0.8
+     * @param detector type detector
+     * @param parser document parser
+     */
+    public Tika(Detector detector, Parser parser) {
+        this.detector = detector;
+        this.parser = parser;
+    }
+
+    /**
+     * Creates a Tika facade using the given configuration.
+     *
+     * @param config Tika configuration
+     */
+    public Tika(TikaConfig config) {
+        this(config.getMimeRepository(), new AutoDetectParser(config));
+    }
+
+    /**
+     * Creates a Tika facade using the default configuration.
+     */
+    public Tika() {
+        this(TikaConfig.getDefaultConfig());
+    }
+
+    /**
+     * Creates a Tika facade using the given detector instance and the
+     * default parser configuration.
+     *
+     * @since Apache Tika 0.8
+     * @param detector type detector
+     */
+    public Tika(Detector detector) {
+        this(detector, new AutoDetectParser(detector));
+    }
+
+    
+    /**
+     * Detects the media type of the given document. The type detection is
+     * based on the content of the given document stream and any given
+     * document metadata. The document stream can be <code>null</code>,
+     * in which case only the given document metadata is used for type
+     * detection.
+     * <p>
+     * If the document stream supports the
+     * {@link InputStream#markSupported() mark feature}, then the stream is
+     * marked and reset to the original position before this method returns.
+     * Only a limited number of bytes are read from the stream.
+     * <p>
+     * The given document stream is <em>not</em> closed by this method.
+     * <p>
+     * Unlike in the {@link #parse(InputStream, Metadata)} method, the
+     * given document metadata is <em>not</em> modified by this method.
+     *
+     * @param stream the document stream, or <code>null</code>
+     * @param metadata document metadata
+     * @return detected media type
+     * @throws IOException if the stream can not be read
+     */
+    public String detect(InputStream stream, Metadata metadata)
+            throws IOException {
+        if (stream == null || stream.markSupported()) {
+            return detector.detect(stream, metadata).toString();
+        } else {
+            return detector.detect(
+                    new BufferedInputStream(stream), metadata).toString();
+        }
+    }
+
+    /**
+     * Detects the media type of the given document. The type detection is
+     * based on the content of the given document stream.
+     * <p>
+     * If the document stream supports the
+     * {@link InputStream#markSupported() mark feature}, then the stream is
+     * marked and reset to the original position before this method returns.
+     * Only a limited number of bytes are read from the stream.
+     * <p>
+     * The given document stream is <em>not</em> closed by this method.
+     *
+     * @param stream the document stream
+     * @return detected media type
+     * @throws IOException if the stream can not be read
+     */
+    public String detect(InputStream stream) throws IOException {
+        return detect(stream, new Metadata());
+    }
+
+    /**
+     * Detects the media type of the given file. The type detection is
+     * based on the document content and a potential known file extension.
+     * <p>
+     * Use the {@link #detect(String)} method when you want to detect the
+     * type of the document without actually accessing the file.
+     *
+     * @param file the file
+     * @return detected media type
+     * @throws IOException if the file can not be read
+     */
+    public String detect(File file) throws IOException {
+        return detect(file.toURI().toURL());
+    }
+
+    /**
+     * Detects the media type of the resource at the given URL. The type
+     * detection is based on the document content and a potential known
+     * file extension included in the URL.
+     * <p>
+     * Use the {@link #detect(String)} method when you want to detect the
+     * type of the document without actually accessing the URL.
+     *
+     * @param url the URL of the resource
+     * @return detected media type
+     * @throws IOException if the resource can not be read
+     */
+    public String detect(URL url) throws IOException {
+        Metadata metadata = new Metadata();
+        InputStream stream = TikaInputStream.get(url, metadata);
+        try {
+            return detect(stream, metadata);
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Detects the media type of a document with the given file name.
+     * The type detection is based on known file name extensions.
+     * <p>
+     * The given name can also be a URL or a full file path. In such cases
+     * only the file name part of the string is used for type detection. 
+     *
+     * @param name the file name of the document
+     * @return detected media type
+     */
+    public String detect(String name) {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        try {
+            return detect(null, metadata);
+        } catch (IOException e) {
+            throw new IllegalStateException("Unexpected IOException", e);
+        }
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     * Input metadata like a file name or a content type hint can be passed
+     * in the given metadata instance. Metadata information extracted from
+     * the document is returned in that same metadata instance.
+     *
+     * @param stream the document to be parsed
+     * @return extracted text content
+     * @throws IOException if the document can not be read or parsed
+     */
+    public Reader parse(InputStream stream, Metadata metadata)
+            throws IOException {
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        return new ParsingReader(parser, stream, metadata, context);
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     *
+     * @param stream the document to be parsed
+     * @return extracted text content
+     * @throws IOException if the document can not be read or parsed
+     */
+    public Reader parse(InputStream stream) throws IOException {
+        return parse(stream, new Metadata());
+    }
+
+    /**
+     * Parses the given file and returns the extracted text content.
+     *
+     * @param file the file to be parsed
+     * @return extracted text content
+     * @throws IOException if the file can not be read or parsed
+     */
+    public Reader parse(File file) throws IOException {
+        return parse(file.toURI().toURL());
+    }
+
+    /**
+     * Parses the resource at the given URL and returns the extracted
+     * text content.
+     *
+     * @param url the URL of the resource to be parsed
+     * @return extracted text content
+     * @throws IOException if the resource can not be read or parsed
+     */
+    public Reader parse(URL url) throws IOException {
+        Metadata metadata = new Metadata();
+        InputStream stream = TikaInputStream.get(url, metadata);
+        return parse(stream, metadata);
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     * The given input stream is closed by this method.
+     * <p>
+     * To avoid unpredictable excess memory use, the returned string contains
+     * only up to {@link #getMaxStringLength()} first characters extracted
+     * from the input document. Use the {@link #setMaxStringLength(int)}
+     * method to adjust this limitation.
+     *
+     * @param stream the document to be parsed
+     * @param metadata document metadata
+     * @return extracted text content
+     * @throws IOException if the document can not be read
+     * @throws TikaException if the document can not be parsed
+     */
+    public String parseToString(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        WriteOutContentHandler handler =
+            new WriteOutContentHandler(maxStringLength);
+        try {
+            ParseContext context = new ParseContext();
+            context.set(Parser.class, parser);
+            parser.parse(
+                    stream, new BodyContentHandler(handler), metadata, context);
+        } catch (SAXException e) {
+            if (!handler.isWriteLimitReached(e)) {
+                // This should never happen with BodyContentHandler...
+                throw new TikaException("Unexpected SAX processing failure", e);
+            }
+        } finally {
+            stream.close();
+        }
+        return handler.toString();
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     * The given input stream is closed by this method.
+     * <p>
+     * To avoid unpredictable excess memory use, the returned string contains
+     * only up to {@link #getMaxStringLength()} first characters extracted
+     * from the input document. Use the {@link #setMaxStringLength(int)}
+     * method to adjust this limitation.
+     *
+     * @param stream the document to be parsed
+     * @return extracted text content
+     * @throws IOException if the document can not be read
+     * @throws TikaException if the document can not be parsed
+     */
+    public String parseToString(InputStream stream)
+            throws IOException, TikaException {
+        return parseToString(stream, new Metadata());
+    }
+
+    /**
+     * Parses the given file and returns the extracted text content.
+     * <p>
+     * To avoid unpredictable excess memory use, the returned string contains
+     * only up to {@link #getMaxStringLength()} first characters extracted
+     * from the input document. Use the {@link #setMaxStringLength(int)}
+     * method to adjust this limitation.
+     *
+     * @param file the file to be parsed
+     * @return extracted text content
+     * @throws IOException if the file can not be read
+     * @throws TikaException if the file can not be parsed
+     */
+    public String parseToString(File file) throws IOException, TikaException {
+        return parseToString(file.toURI().toURL());
+    }
+
+    /**
+     * Parses the resource at the given URL and returns the extracted
+     * text content.
+     * <p>
+     * To avoid unpredictable excess memory use, the returned string contains
+     * only up to {@link #getMaxStringLength()} first characters extracted
+     * from the input document. Use the {@link #setMaxStringLength(int)}
+     * method to adjust this limitation.
+     *
+     * @param url the URL of the resource to be parsed
+     * @return extracted text content
+     * @throws IOException if the resource can not be read
+     * @throws TikaException if the resource can not be parsed
+     */
+    public String parseToString(URL url) throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        InputStream stream = TikaInputStream.get(url, metadata);
+        return parseToString(stream, metadata);
+    }
+
+    /**
+     * Returns the maximum length of strings returned by the
+     * parseToString methods.
+     *
+     * @since Apache Tika 0.7
+     * @return maximum string length, or -1 if the limit has been disabled
+     */
+    public int getMaxStringLength() {
+        return maxStringLength;
+    }
+
+    /**
+     * Sets the maximum length of strings returned by the parseToString
+     * methods.
+     *
+     * @since Apache Tika 0.7
+     * @param maxStringLength maximum string length,
+     *                        or -1 to disable this limit
+     */
+    public void setMaxStringLength(int maxStringLength) {
+        this.maxStringLength = maxStringLength;
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java Thu Sep  2 14:46:46 2010
@@ -1,68 +1,68 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
-
-/**
- * Content type detector that combines multiple different detection mechanisms.
- */
-public class CompositeDetector implements Detector {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = 5980683158436430252L;
-
-    private final MediaTypeRegistry registry;
-
-    private final List<Detector> detectors;
-
-    public CompositeDetector(
-            MediaTypeRegistry registry, List<Detector> detectors) {
-        this.registry = registry;
-        this.detectors = detectors;
-    }
-
-    public CompositeDetector(List<Detector> detectors) {
-        this(new MediaTypeRegistry(), detectors);
-    }
-
-    public CompositeDetector(Detector... detectors) {
-        this(Arrays.asList(detectors));
-    }
-
-    public MediaType detect(InputStream input, Metadata metadata)
-            throws IOException { 
-        MediaType type = MediaType.OCTET_STREAM;
-        for (Detector detector : detectors) {
-            MediaType detected = detector.detect(input, metadata);
-            if (registry.isSpecializationOf(detected, type)) {
-                type = detected;
-            }
-        }
-        return type;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+
+/**
+ * Content type detector that combines multiple different detection mechanisms.
+ */
+public class CompositeDetector implements Detector {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 5980683158436430252L;
+
+    private final MediaTypeRegistry registry;
+
+    private final List<Detector> detectors;
+
+    public CompositeDetector(
+            MediaTypeRegistry registry, List<Detector> detectors) {
+        this.registry = registry;
+        this.detectors = detectors;
+    }
+
+    public CompositeDetector(List<Detector> detectors) {
+        this(new MediaTypeRegistry(), detectors);
+    }
+
+    public CompositeDetector(Detector... detectors) {
+        this(Arrays.asList(detectors));
+    }
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException { 
+        MediaType type = MediaType.OCTET_STREAM;
+        for (Detector detector : detectors) {
+            MediaType detected = detector.detect(input, metadata);
+            if (registry.isSpecializationOf(detected, type)) {
+                type = detected;
+            }
+        }
+        return type;
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java Thu Sep  2 14:46:46 2010
@@ -1,59 +1,59 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detector. Implementations of this interface use various
- * heuristics to detect the content type of a document based on given
- * input metadata or the first few bytes of the document stream.
- *
- * @since Apache Tika 0.3
- */
-public interface Detector extends Serializable {
-
-    /**
-     * Detects the content type of the given input document. Returns
-     * <code>application/octet-stream</code> if the type of the document
-     * can not be detected.
-     * <p>
-     * If the document input stream is not available, then the first
-     * argument may be <code>null</code>. Otherwise the detector may
-     * read bytes from the start of the stream to help in type detection.
-     * The given stream is guaranteed to support the
-     * {@link InputStream#markSupported() mark feature} and the detector
-     * is expected to {@link InputStream#mark(int) mark} the stream before
-     * reading any bytes from it, and to {@link InputStream#reset() reset}
-     * the stream before returning. The stream must not be closed by the
-     * detector.
-     * <p>
-     * The given input metadata is only read, not modified, by the detector.
-     *
-     * @param input document input stream, or <code>null</code>
-     * @param metadata input metadata for the document
-     * @return detected media type, or <code>application/octet-stream</code>
-     * @throws IOException if the document input stream could not be read
-     */
-    MediaType detect(InputStream input, Metadata metadata) throws IOException;
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detector. Implementations of this interface use various
+ * heuristics to detect the content type of a document based on given
+ * input metadata or the first few bytes of the document stream.
+ *
+ * @since Apache Tika 0.3
+ */
+public interface Detector extends Serializable {
+
+    /**
+     * Detects the content type of the given input document. Returns
+     * <code>application/octet-stream</code> if the type of the document
+     * can not be detected.
+     * <p>
+     * If the document input stream is not available, then the first
+     * argument may be <code>null</code>. Otherwise the detector may
+     * read bytes from the start of the stream to help in type detection.
+     * The given stream is guaranteed to support the
+     * {@link InputStream#markSupported() mark feature} and the detector
+     * is expected to {@link InputStream#mark(int) mark} the stream before
+     * reading any bytes from it, and to {@link InputStream#reset() reset}
+     * the stream before returning. The stream must not be closed by the
+     * detector.
+     * <p>
+     * The given input metadata is only read, not modified, by the detector.
+     *
+     * @param input document input stream, or <code>null</code>
+     * @param metadata input metadata for the document
+     * @return detected media type, or <code>application/octet-stream</code>
+     * @throws IOException if the document input stream could not be read
+     */
+    MediaType detect(InputStream input, Metadata metadata) throws IOException;
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Thu Sep  2 14:46:46 2010
@@ -1,216 +1,216 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection based on magic bytes, i.e. type-specific patterns
- * near the beginning of the document input stream.
- *
- * @since Apache Tika 0.3
- */
-public class MagicDetector implements Detector {
-
-    /**
-     * The matching media type. Returned by the
-     * {@link #detect(InputStream, Metadata)} method if a match is found.
-     */
-    private final MediaType type;
-
-    /**
-     * Length of the comparison window. All the byte arrays here are this long.
-     */
-    private final int length;
-
-    /**
-     * The magic match pattern. If this byte pattern is equal to the
-     * possibly bit-masked bytes from the input stream, then the type
-     * detection succeeds and the configured {@link #type} is returned.
-     */
-    private final byte[] pattern;
-
-    /**
-     * Bit mask that is applied to the source bytes before pattern matching.
-     */
-    private final byte[] mask;
-
-    /**
-     * First offset (inclusive) of the comparison window within the
-     * document input stream. Greater than or equal to zero.
-     */
-    private final int offsetRangeBegin;
-
-    /**
-     * Last offset (inclusive) of the comparison window within the document
-     * input stream. Greater than or equal to the
-     * {@link #offsetRangeBegin first offset}.
-     * <p>
-     * Note that this is <em>not</em> the offset of the last byte read from
-     * the document stream. Instead, the last window of bytes to be compared
-     * starts at this offset.
-     */
-    private final int offsetRangeEnd;
-    
-    private final String asString;
-
-    /**
-     * Creates a detector for input documents that have the exact given byte
-     * pattern at the beginning of the document stream.
-     *
-     * @param type matching media type
-     * @param pattern magic match pattern
-     */
-    public MagicDetector(MediaType type, byte[] pattern) {
-        this(type, pattern, 0);
-    }
-
-    /**
-     * Creates a detector for input documents that have the exact given byte
-     * pattern at the given offset of the document stream.
-     *
-     * @param type matching media type
-     * @param pattern magic match pattern
-     * @param offset offset of the pattern match
-     */
-    public MagicDetector(MediaType type, byte[] pattern, int offset) {
-        this(type, pattern, null, offset, offset);
-    }
-
-    /**
-     * Creates a detector for input documents that meet the specified
-     * magic match.
-     */
-    public MagicDetector(
-            MediaType type, byte[] pattern, byte[] mask,
-            int offsetRangeBegin, int offsetRangeEnd) {
-        if (type == null) {
-            throw new IllegalArgumentException("Matching media type is null");
-        } else if (pattern == null) {
-            throw new IllegalArgumentException("Magic match pattern is null");
-        } else if (offsetRangeBegin < 0
-                || offsetRangeEnd < offsetRangeBegin) {
-            throw new IllegalArgumentException(
-                    "Invalid offset range: ["
-                    + offsetRangeBegin + "," + offsetRangeEnd + "]");
-        }
-
-        this.type = type;
-
-        this.length = Math.max(pattern.length, mask != null ? mask.length : 0);
-
-        this.mask = new byte[length];
-        this.pattern = new byte[length];
-
-        for (int i = 0; i < length; i++) {
-            if (mask != null && i < mask.length) {
-                this.mask[i] = mask[i];
-            } else {
-                this.mask[i] = -1;
-            }
-
-            if (i < pattern.length) {
-                this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
-            } else {
-                this.pattern[i] = 0;
-            }
-        }
-
-        this.offsetRangeBegin = offsetRangeBegin;
-        this.offsetRangeEnd = offsetRangeEnd;
-        
-        // Build the string representation. Needs to be unique, as
-        //  these get compared. Compute now as may get compared a lot!
-        this.asString = "Magic Detection for " + type.toString() +
-          " looking for " + pattern.length + 
-          " bytes = " + this.pattern + 
-          " mask = " + this.mask;
-    }
-
-    /**
-     * 
-     * @param input document input stream, or <code>null</code>
-     * @param metadata ignored
-     */
-    public MediaType detect(InputStream input, Metadata metadata)
-            throws IOException {
-        if (input == null) {
-            return MediaType.OCTET_STREAM;
-        }
-
-        input.mark(offsetRangeEnd + length);
-        try {
-            int offset = 0;
-
-            // Skip bytes at the beginning, using skip() or read()
-            while (offset < offsetRangeBegin) {
-                long n = input.skip(offsetRangeBegin - offset);
-                if (n > 0) {
-                    offset += n;
-                } else if (input.read() != -1) {
-                    offset += 1;
-                } else {
-                    return MediaType.OCTET_STREAM;
-                }
-            }
-
-            // Fill in the comparison window
-            byte[] buffer =
-                new byte[length + (offsetRangeEnd - offsetRangeBegin)];
-            int n = input.read(buffer);
-            if (n > 0) {
-                offset += n;
-            }
-            while (n != -1 && offset < offsetRangeEnd + length) {
-                int bufferOffset = offset - offsetRangeBegin;
-                n = input.read(
-                        buffer, bufferOffset, buffer.length - bufferOffset);
-            }
-            if (offset < offsetRangeBegin + length) {
-                return MediaType.OCTET_STREAM;
-            }
-
-            // Loop until we've covered the entire offset range
-            for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
-                boolean match = true;
-                for (int j = 0; match && j < length; j++) {
-                    match = (buffer[i + j] & mask[j]) == pattern[j];
-                }
-                if (match) {
-                    return type;
-                }
-            }
-
-            return MediaType.OCTET_STREAM;
-        } finally {
-            input.reset();
-        }
-    }
-
-    /**
-     * Returns a string representation of the Detection Rule.
-     * Should sort nicely by type and details, as we sometimes
-     *  compare these.
-     */
-    public String toString() {
-       return asString;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on magic bytes, i.e. type-specific patterns
+ * near the beginning of the document input stream.
+ *
+ * @since Apache Tika 0.3
+ */
+public class MagicDetector implements Detector {
+
+    /**
+     * The matching media type. Returned by the
+     * {@link #detect(InputStream, Metadata)} method if a match is found.
+     */
+    private final MediaType type;
+
+    /**
+     * Length of the comparison window. All the byte arrays here are this long.
+     */
+    private final int length;
+
+    /**
+     * The magic match pattern. If this byte pattern is equal to the
+     * possibly bit-masked bytes from the input stream, then the type
+     * detection succeeds and the configured {@link #type} is returned.
+     */
+    private final byte[] pattern;
+
+    /**
+     * Bit mask that is applied to the source bytes before pattern matching.
+     */
+    private final byte[] mask;
+
+    /**
+     * First offset (inclusive) of the comparison window within the
+     * document input stream. Greater than or equal to zero.
+     */
+    private final int offsetRangeBegin;
+
+    /**
+     * Last offset (inclusive) of the comparison window within the document
+     * input stream. Greater than or equal to the
+     * {@link #offsetRangeBegin first offset}.
+     * <p>
+     * Note that this is <em>not</em> the offset of the last byte read from
+     * the document stream. Instead, the last window of bytes to be compared
+     * starts at this offset.
+     */
+    private final int offsetRangeEnd;
+    
+    private final String asString;
+
+    /**
+     * Creates a detector for input documents that have the exact given byte
+     * pattern at the beginning of the document stream.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     */
+    public MagicDetector(MediaType type, byte[] pattern) {
+        this(type, pattern, 0);
+    }
+
+    /**
+     * Creates a detector for input documents that have the exact given byte
+     * pattern at the given offset of the document stream.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     * @param offset offset of the pattern match
+     */
+    public MagicDetector(MediaType type, byte[] pattern, int offset) {
+        this(type, pattern, null, offset, offset);
+    }
+
+    /**
+     * Creates a detector for input documents that meet the specified
+     * magic match.
+     */
+    public MagicDetector(
+            MediaType type, byte[] pattern, byte[] mask,
+            int offsetRangeBegin, int offsetRangeEnd) {
+        if (type == null) {
+            throw new IllegalArgumentException("Matching media type is null");
+        } else if (pattern == null) {
+            throw new IllegalArgumentException("Magic match pattern is null");
+        } else if (offsetRangeBegin < 0
+                || offsetRangeEnd < offsetRangeBegin) {
+            throw new IllegalArgumentException(
+                    "Invalid offset range: ["
+                    + offsetRangeBegin + "," + offsetRangeEnd + "]");
+        }
+
+        this.type = type;
+
+        this.length = Math.max(pattern.length, mask != null ? mask.length : 0);
+
+        this.mask = new byte[length];
+        this.pattern = new byte[length];
+
+        for (int i = 0; i < length; i++) {
+            if (mask != null && i < mask.length) {
+                this.mask[i] = mask[i];
+            } else {
+                this.mask[i] = -1;
+            }
+
+            if (i < pattern.length) {
+                this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
+            } else {
+                this.pattern[i] = 0;
+            }
+        }
+
+        this.offsetRangeBegin = offsetRangeBegin;
+        this.offsetRangeEnd = offsetRangeEnd;
+        
+        // Build the string representation. Needs to be unique, as
+        //  these get compared. Compute now as may get compared a lot!
+        this.asString = "Magic Detection for " + type.toString() +
+          " looking for " + pattern.length + 
+          " bytes = " + this.pattern + 
+          " mask = " + this.mask;
+    }
+
+    /**
+     * 
+     * @param input document input stream, or <code>null</code>
+     * @param metadata ignored
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        input.mark(offsetRangeEnd + length);
+        try {
+            int offset = 0;
+
+            // Skip bytes at the beginning, using skip() or read()
+            while (offset < offsetRangeBegin) {
+                long n = input.skip(offsetRangeBegin - offset);
+                if (n > 0) {
+                    offset += n;
+                } else if (input.read() != -1) {
+                    offset += 1;
+                } else {
+                    return MediaType.OCTET_STREAM;
+                }
+            }
+
+            // Fill in the comparison window
+            byte[] buffer =
+                new byte[length + (offsetRangeEnd - offsetRangeBegin)];
+            int n = input.read(buffer);
+            if (n > 0) {
+                offset += n;
+            }
+            while (n != -1 && offset < offsetRangeEnd + length) {
+                int bufferOffset = offset - offsetRangeBegin;
+                n = input.read(
+                        buffer, bufferOffset, buffer.length - bufferOffset);
+            }
+            if (offset < offsetRangeBegin + length) {
+                return MediaType.OCTET_STREAM;
+            }
+
+            // Loop until we've covered the entire offset range
+            for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
+                boolean match = true;
+                for (int j = 0; match && j < length; j++) {
+                    match = (buffer[i + j] & mask[j]) == pattern[j];
+                }
+                if (match) {
+                    return type;
+                }
+            }
+
+            return MediaType.OCTET_STREAM;
+        } finally {
+            input.reset();
+        }
+    }
+
+    /**
+     * Returns a string representation of the Detection Rule.
+     * Should sort nicely by type and details, as we sometimes
+     *  compare these.
+     */
+    public String toString() {
+       return asString;
+    }
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java Thu Sep  2 14:46:46 2010
@@ -1,143 +1,143 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection based on the resource name. An instance of this
- * class contains a set of regular expression patterns that are matched
- * against the resource name potentially given as a part of the input metadata.
- * <p>
- * If a pattern matches the given name, then the media type associated with
- * that pattern is returned as the likely content type of the input document.
- * Otherwise the returned type is <code>application/octet-stream</code>.
- * <p>
- * See the {@link #detect(InputStream, Metadata)} method for more details
- * of the matching algorithm.
- *
- * @since Apache Tika 0.3
- */
-public class NameDetector implements Detector {
-
-    /**
-     * The regular expression patterns used for type detection.
-     */
-    private final Map<Pattern, MediaType> patterns;
-
-    /**
-     * Creates a new content type detector based on the given name patterns.
-     * The given pattern map is not copied, so the caller may update the
-     * mappings even after this detector instance has been created. However,
-     * the map <em>must not be concurrently modified</em> while this instance
-     * is used for type detection.
-     *
-     * @param patterns map from name patterns to corresponding media types
-     */
-    public NameDetector(Map<Pattern, MediaType> patterns) {
-        this.patterns = patterns;
-    }
-
-    /**
-     * Detects the content type of an input document based on the document
-     * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
-     * the given input metadata is expected to contain the name (normally
-     * a file name or a URL) of the input document.
-     * <p>
-     * If a resource name is given, then it is first processed as follows.
-     * <ol>
-     *   <li>
-     *     Potential URL query (?...) and fragment identifier (#...)
-     *     parts are removed from the end of the resource name.
-     *   </li>
-     *   <li>
-     *     Potential leading path elements (up to the last slash or backslash)
-     *     are removed from the beginning of the resource name.
-     *   </li>
-     *   <li>
-     *     Potential URL encodings (%nn, in UTF-8) are decoded.
-     *   </li>
-     *   <li>
-     *     Any leading and trailing whitespace is removed.
-     *   </li>
-     * </ol>
-     * <p>
-     * The resulting name string (if any) is then matched in sequence against
-     * all the configured name patterns. If a match is found, then the (first)
-     * matching media type is returned.
-     *
-     * @param input ignored
-     * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
-     * @return detected media type, or <code>application/octet-stream</code>
-     */
-    public MediaType detect(InputStream input, Metadata metadata) {
-        // Look for a resource name in the input metadata
-        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (name != null) {
-            // If the name is a URL, skip the trailing query and fragment parts
-            int question = name.indexOf('?');
-            if (question != -1) {
-                name = name.substring(0, question);
-            }
-            int hash = name.indexOf('#');
-            if (hash != -1) {
-                name = name.substring(0, hash);
-            }
-
-            // If the name is a URL or a path, skip all but the last component
-            int slash = name.lastIndexOf('/');
-            if (slash != -1) {
-                name = name.substring(slash + 1);
-            }
-            int backslash = name.lastIndexOf('\\');
-            if (backslash != -1) {
-                name = name.substring(backslash + 1);
-            }
-
-            // Decode any potential URL encoding
-            int percent = name.indexOf('%');
-            if (percent != -1) {
-                try {
-                    name = URLDecoder.decode(name, "UTF-8");
-                } catch (UnsupportedEncodingException e) {
-                    throw new IllegalStateException("UTF-8 not supported", e);
-                }
-            }
-
-            // Skip any leading or trailing whitespace
-            name = name.trim();
-            if (name.length() > 0) {
-                // Match the name against the registered patterns
-                for (Pattern pattern : patterns.keySet()) {
-                    if (pattern.matcher(name).matches()) {
-                        return patterns.get(pattern);
-                    }
-                }
-            }
-        }
-
-        return MediaType.OCTET_STREAM;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on the resource name. An instance of this
+ * class holds a map from regular expression patterns to media types, and
+ * matches those patterns against the resource name that may be given as
+ * a part of the input metadata.
+ * <p>
+ * If a pattern matches the given name, then the media type associated with
+ * that pattern is returned as the likely content type of the input document.
+ * Otherwise the returned type is <code>application/octet-stream</code>.
+ * <p>
+ * See the {@link #detect(InputStream, Metadata)} method for more details
+ * of the matching algorithm.
+ *
+ * @since Apache Tika 0.3
+ */
+public class NameDetector implements Detector {
+
+    /** The regular expression patterns used for type detection. */
+    private final Map<Pattern, MediaType> patterns;
+
+    /**
+     * Creates a new content type detector based on the given name patterns.
+     * The given pattern map is not copied, so the caller may update the
+     * mappings even after this detector instance has been created. However,
+     * the map <em>must not be concurrently modified</em> while this instance
+     * is used for type detection.
+     *
+     * @param patterns map from name patterns to corresponding media types
+     */
+    public NameDetector(Map<Pattern, MediaType> patterns) {
+        this.patterns = patterns;
+    }
+
+    /**
+     * Detects the content type of an input document based on the document
+     * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
+     * the given input metadata is expected to contain the name (normally
+     * a file name or a URL) of the input document.
+     * <p>
+     * If a resource name is given, then it is first processed as follows.
+     * <ol>
+     *   <li>
+     *     Potential URL query (?...) and fragment identifier (#...)
+     *     parts are removed from the end of the resource name.
+     *   </li>
+     *   <li>
+     *     Potential leading path elements (up to the last slash or backslash)
+     *     are removed from the beginning of the resource name.
+     *   </li>
+     *   <li>
+     *     Potential URL encodings (%nn, in UTF-8) are decoded. A name whose
+     *     percent escapes are malformed (for example "100%.txt") is not a
+     *     URL-encoded name and is used as it is.
+     *   </li>
+     *   <li>
+     *     Any leading and trailing whitespace is removed.
+     *   </li>
+     * </ol>
+     * <p>
+     * The resulting name string (if any) is then matched in sequence against
+     * all the configured name patterns. If a match is found, then the (first)
+     * matching media type is returned.
+     *
+     * @param input ignored
+     * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
+     * @return detected media type, or <code>application/octet-stream</code>
+     */
+    public MediaType detect(InputStream input, Metadata metadata) {
+        // Look for a resource name in the input metadata
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null) {
+            // If the name is a URL, skip the trailing query and fragment parts
+            int question = name.indexOf('?');
+            if (question != -1) {
+                name = name.substring(0, question);
+            }
+            int hash = name.indexOf('#');
+            if (hash != -1) {
+                name = name.substring(0, hash);
+            }
+
+            // If the name is a URL or a path, skip all but the last component.
+            // lastIndexOf() returns -1 when there is no separator, in which
+            // case substring(0) keeps the whole name.
+            name = name.substring(name.lastIndexOf('/') + 1);
+            name = name.substring(name.lastIndexOf('\\') + 1);
+
+            // Decode any potential URL encoding
+            int percent = name.indexOf('%');
+            if (percent != -1) {
+                try {
+                    name = URLDecoder.decode(name, "UTF-8");
+                } catch (UnsupportedEncodingException e) {
+                    throw new IllegalStateException("UTF-8 not supported", e);
+                } catch (IllegalArgumentException ignored) {
+                    // Malformed escape such as "100%.txt": the '%' is a
+                    // literal character, so keep the name undecoded
+                }
+            }
+
+            // Skip any leading or trailing whitespace
+            name = name.trim();
+            if (name.length() > 0) {
+                // Match the name against the registered patterns
+                for (Map.Entry<Pattern, MediaType> rule : patterns.entrySet()) {
+                    if (rule.getKey().matcher(name).matches()) {
+                        return rule.getValue();
+                    }
+                }
+            }
+        }
+
+        return MediaType.OCTET_STREAM;
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Thu Sep  2 14:46:46 2010
@@ -1,115 +1,115 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection of plain text documents. This detector looks at the
- * beginning of the document input stream and considers the document to be
- * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
- * found.
- * <p>
- * Note that text documents with a character encoding like UTF-16 are better
- * detected with {@link MagicDetector} and an appropriate magic byte pattern.
- *
- * @since Apache Tika 0.3
- */
-public class TextDetector implements Detector {
-
-    /**
-     * The number of bytes from the beginning of the document stream
-     * to test for control bytes.
-     */
-    private static final int NUMBER_OF_BYTES_TO_TEST = 512;
-
-    /**
-     * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
-     * in the range below 0x20 (the space character). If an entry in this
-     * table is <code>true</code> then that byte is very unlikely to occur
-     * in a plain text document.
-     * <p>
-     * The contents of this lookup table are based on the following definition
-     * from section 4 of the "Content-Type Processing Model" Internet-draft
-     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
-     * >draft-abarth-mime-sniff-01</a>).
-     * <pre>
-     * +-------------------------+
-     * | Binary data byte ranges |
-     * +-------------------------+
-     * | 0x00 -- 0x08            |
-     * | 0x0B                    |
-     * | 0x0E -- 0x1A            |
-     * | 0x1C -- 0x1F            |
-     * +-------------------------+
-     * </pre>
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
-     */
-    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
-
-    static {
-        Arrays.fill(IS_CONTROL_BYTE, true);
-        IS_CONTROL_BYTE[0x09] = false; // tabulator
-        IS_CONTROL_BYTE[0x0A] = false; // new line
-        IS_CONTROL_BYTE[0x0C] = false; // new page
-        IS_CONTROL_BYTE[0x0D] = false; // carriage return
-        IS_CONTROL_BYTE[0x1B] = false; // escape
-    }
-
-    /**
-     * Looks at the beginning of the document input stream to determine
-     * whether the document is text or not.
-     *
-     * @param input document input stream, or <code>null</code>
-     * @param metadata ignored
-     * @return "text/plain" if the input stream suggest a text document,
-     *         "application/octet-stream" otherwise
-     */
-    public MediaType detect(InputStream input, Metadata metadata)
-            throws IOException {
-        if (input == null) {
-            return MediaType.OCTET_STREAM;
-        }
-
-        input.mark(NUMBER_OF_BYTES_TO_TEST);
-        try {
-            for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
-                int ch = input.read();
-                if (ch == -1) {
-                    if (i > 0) {
-                        return MediaType.TEXT_PLAIN;
-                    } else {
-                        // See https://issues.apache.org/jira/browse/TIKA-483
-                        return MediaType.OCTET_STREAM;
-                    }
-                } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
-                    return MediaType.OCTET_STREAM;
-                }
-            }
-            return MediaType.TEXT_PLAIN;
-        } finally {
-            input.reset();
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector for plain text content. It inspects the first bytes of the
+ * document input stream and treats the document as text as long as none
+ * of them is an ASCII (ISO-Latin-1, UTF-8, etc.) control byte.
+ * <p>
+ * Text in an encoding like UTF-16 is better recognized by a
+ * {@link MagicDetector} configured with a suitable magic byte pattern.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TextDetector implements Detector {
+
+    /**
+     * How many bytes are examined, counted from the start of the stream.
+     */
+    private static final int NUMBER_OF_BYTES_TO_TEST = 512;
+
+    /**
+     * Flags, indexed by byte value, for the bytes below 0x20 (the space
+     * character). A <code>true</code> entry marks a control byte that is
+     * very unlikely to occur in a plain text document.
+     * <p>
+     * The flagged values follow section 4 of the "Content-Type Processing
+     * Model" Internet-draft
+     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+     * >draft-abarth-mime-sniff-01</a>):
+     * <pre>
+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * </pre>
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
+     */
+    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
+
+    static {
+        // Start with every byte flagged, then clear those that occur in text:
+        // tabulator, new line, new page, carriage return and escape
+        Arrays.fill(IS_CONTROL_BYTE, true);
+        for (int textByte : new int[] { 0x09, 0x0A, 0x0C, 0x0D, 0x1B }) {
+            IS_CONTROL_BYTE[textByte] = false;
+        }
+    }
+
+    /**
+     * Decides whether the document looks like plain text by examining the
+     * beginning of its input stream. The stream is marked before reading
+     * and reset afterwards.
+     *
+     * @param input document input stream, or <code>null</code>
+     * @param metadata ignored
+     * @return "text/plain" if the input stream suggests a text document,
+     *         "application/octet-stream" otherwise
+     * @throws IOException if reading or resetting the stream fails
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        input.mark(NUMBER_OF_BYTES_TO_TEST);
+        try {
+            int examined = 0;
+            while (examined < NUMBER_OF_BYTES_TO_TEST) {
+                int value = input.read();
+                if (value == -1) {
+                    // End of stream: an empty stream is not text, see
+                    // https://issues.apache.org/jira/browse/TIKA-483
+                    return (examined == 0)
+                            ? MediaType.OCTET_STREAM : MediaType.TEXT_PLAIN;
+                }
+                if (value < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[value]) {
+                    return MediaType.OCTET_STREAM;
+                }
+                examined++;
+            }
+            return MediaType.TEXT_PLAIN;
+        } finally {
+            input.reset();
+        }
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java Thu Sep  2 14:46:46 2010
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection based on a content type hint. This detector simply
- * trusts any valid content type hint given in the input metadata, and returns
- * that as the likely type of the input document.
- *
- * @since Apache Tika 0.3
- */
-public class TypeDetector implements Detector {
-
-    /**
-     * Detects the content type of an input document based on a type hint
-     * given in the input metadata. The CONTENT_TYPE attribute of the given
-     * input metadata is expected to contain the type of the input document.
-     * If that attribute exists and contains a valid type name, then that
-     * type is returned.
-     *
-     * @param input ignored
-     * @param metadata input metadata, possibly with a CONTENT_TYPE value
-     * @return detected media type, or <code>application/octet-stream</code>
-     */
-    public MediaType detect(InputStream input, Metadata metadata) {
-        // Look for a type hint in the input metadata
-        String hint = metadata.get(Metadata.CONTENT_TYPE);
-        if (hint != null) {
-            MediaType type = MediaType.parse(hint);
-            if (type != null) {
-                return type;
-            }
-        }
-        return MediaType.OCTET_STREAM;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector that relies on a content type hint. Any valid content type
+ * hint found in the input metadata is trusted as it is and reported as
+ * the likely type of the input document.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TypeDetector implements Detector {
+
+    /**
+     * Reports the media type named by the CONTENT_TYPE attribute of the
+     * given input metadata. The attribute value is handed to
+     * {@link MediaType#parse(String)}; when the attribute is missing, or
+     * the parse result is <code>null</code>, the generic
+     * <code>application/octet-stream</code> type is reported instead.
+     *
+     * @param input ignored
+     * @param metadata input metadata, possibly with a CONTENT_TYPE value
+     * @return detected media type, or <code>application/octet-stream</code>
+     */
+    public MediaType detect(InputStream input, Metadata metadata) {
+        String hint = metadata.get(Metadata.CONTENT_TYPE);
+        if (hint == null) {
+            // No type hint available
+            return MediaType.OCTET_STREAM;
+        }
+
+        // Fall back to the generic type when parsing yields no media type
+        MediaType parsed = MediaType.parse(hint);
+        return (parsed != null) ? parsed : MediaType.OCTET_STREAM;
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java Thu Sep  2 14:46:46 2010
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.metadata;
-
-/**
- * XMP Paged-text schema. This is a collection of
- * {@link Property property definition} constants for the paged text
- * properties defined in the XMP standard.
- *
- * @since Apache Tika 0.8
- * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
- *        >XMP Specification, Part 2: Standard Schemas</a>
- */
-public interface PagedText {
-
-    /**
-     * "The number of pages in the document (including any in contained
-     * documents)."
-     */
-    Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * XMP Paged-text schema: the {@link Property property definition}
+ * constants for the paged text properties defined in the XMP standard.
+ *
+ * @since Apache Tika 0.8
+ * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
+ *        >XMP Specification, Part 2: Standard Schemas</a>
+ */
+public interface PagedText {
+
+    /**
+     * Page count, declared through <code>Property.internalInteger</code>
+     * under the name <code>xmpTPg:NPages</code>: "The number of pages in
+     * the document (including any in contained documents)."
+     */
+    Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
------------------------------------------------------------------------------
    svn:eol-style = native