You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/09/02 16:46:48 UTC
svn commit: r991956 [2/6] - in /tika/trunk: src/site/apt/
tika-core/src/main/java/org/apache/tika/
tika-core/src/main/java/org/apache/tika/detect/
tika-core/src/main/java/org/apache/tika/metadata/
tika-core/src/main/java/org/apache/tika/parser/ tika-co...
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Thu Sep 2 14:46:46 2010
@@ -1,382 +1,382 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika;
-
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.net.URL;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParsingReader;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Facade class for accessing Tika functionality. This class hides much of
- * the underlying complexity of the lower level Tika classes and provides
- * simple methods for many common parsing and type detection operations.
- *
- * @since Apache Tika 0.5
- * @see Parser
- * @see Detector
- */
-public class Tika {
-
- /**
- * The detector instance used by this facade.
- */
- private final Detector detector;
-
- /**
- * The parser instance used by this facade.
- */
- private final Parser parser;
-
- /**
- * Maximum length of the strings returned by the parseToString methods.
- * Used to prevent out of memory problems with huge input documents.
- * The default setting is 100k characters.
- */
- private int maxStringLength = 100 * 1000;
-
- /**
- * Creates a Tika facade using the given detector and parser instances.
- *
- * @since Apache Tika 0.8
- * @param detector type detector
- * @param parser document parser
- */
- public Tika(Detector detector, Parser parser) {
- this.detector = detector;
- this.parser = parser;
- }
-
- /**
- * Creates a Tika facade using the given configuration.
- *
- * @param config Tika configuration
- */
- public Tika(TikaConfig config) {
- this(config.getMimeRepository(), new AutoDetectParser(config));
- }
-
- /**
- * Creates a Tika facade using the default configuration.
- */
- public Tika() {
- this(TikaConfig.getDefaultConfig());
- }
-
- /**
- * Creates a Tika facade using the given detector instance and the
- * default parser configuration.
- *
- * @since Apache Tika 0.8
- * @param detector type detector
- */
- public Tika(Detector detector) {
- this(detector, new AutoDetectParser(detector));
- }
-
-
- /**
- * Detects the media type of the given document. The type detection is
- * based on the content of the given document stream and any given
- * document metadata. The document stream can be <code>null</code>,
- * in which case only the given document metadata is used for type
- * detection.
- * <p>
- * If the document stream supports the
- * {@link InputStream#markSupported() mark feature}, then the stream is
- * marked and reset to the original position before this method returns.
- * Only a limited number of bytes are read from the stream.
- * <p>
- * The given document stream is <em>not</em> closed by this method.
- * <p>
- * Unlike in the {@link #parse(InputStream, Metadata)} method, the
- * given document metadata is <em>not</em> modified by this method.
- *
- * @param stream the document stream, or <code>null</code>
- * @param metadata document metadata
- * @return detected media type
- * @throws IOException if the stream can not be read
- */
- public String detect(InputStream stream, Metadata metadata)
- throws IOException {
- if (stream == null || stream.markSupported()) {
- return detector.detect(stream, metadata).toString();
- } else {
- return detector.detect(
- new BufferedInputStream(stream), metadata).toString();
- }
- }
-
- /**
- * Detects the media type of the given document. The type detection is
- * based on the content of the given document stream.
- * <p>
- * If the document stream supports the
- * {@link InputStream#markSupported() mark feature}, then the stream is
- * marked and reset to the original position before this method returns.
- * Only a limited number of bytes are read from the stream.
- * <p>
- * The given document stream is <em>not</em> closed by this method.
- *
- * @param stream the document stream
- * @return detected media type
- * @throws IOException if the stream can not be read
- */
- public String detect(InputStream stream) throws IOException {
- return detect(stream, new Metadata());
- }
-
- /**
- * Detects the media type of the given file. The type detection is
- * based on the document content and a potential known file extension.
- * <p>
- * Use the {@link #detect(String)} method when you want to detect the
- * type of the document without actually accessing the file.
- *
- * @param file the file
- * @return detected media type
- * @throws IOException if the file can not be read
- */
- public String detect(File file) throws IOException {
- return detect(file.toURI().toURL());
- }
-
- /**
- * Detects the media type of the resource at the given URL. The type
- * detection is based on the document content and a potential known
- * file extension included in the URL.
- * <p>
- * Use the {@link #detect(String)} method when you want to detect the
- * type of the document without actually accessing the URL.
- *
- * @param url the URL of the resource
- * @return detected media type
- * @throws IOException if the resource can not be read
- */
- public String detect(URL url) throws IOException {
- Metadata metadata = new Metadata();
- InputStream stream = TikaInputStream.get(url, metadata);
- try {
- return detect(stream, metadata);
- } finally {
- stream.close();
- }
- }
-
- /**
- * Detects the media type of a document with the given file name.
- * The type detection is based on known file name extensions.
- * <p>
- * The given name can also be a URL or a full file path. In such cases
- * only the file name part of the string is used for type detection.
- *
- * @param name the file name of the document
- * @return detected media type
- */
- public String detect(String name) {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
- try {
- return detect(null, metadata);
- } catch (IOException e) {
- throw new IllegalStateException("Unexpected IOException", e);
- }
- }
-
- /**
- * Parses the given document and returns the extracted text content.
- * Input metadata like a file name or a content type hint can be passed
- * in the given metadata instance. Metadata information extracted from
- * the document is returned in that same metadata instance.
- *
- * @param stream the document to be parsed
- * @return extracted text content
- * @throws IOException if the document can not be read or parsed
- */
- public Reader parse(InputStream stream, Metadata metadata)
- throws IOException {
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
- return new ParsingReader(parser, stream, metadata, context);
- }
-
- /**
- * Parses the given document and returns the extracted text content.
- *
- * @param stream the document to be parsed
- * @return extracted text content
- * @throws IOException if the document can not be read or parsed
- */
- public Reader parse(InputStream stream) throws IOException {
- return parse(stream, new Metadata());
- }
-
- /**
- * Parses the given file and returns the extracted text content.
- *
- * @param file the file to be parsed
- * @return extracted text content
- * @throws IOException if the file can not be read or parsed
- */
- public Reader parse(File file) throws IOException {
- return parse(file.toURI().toURL());
- }
-
- /**
- * Parses the resource at the given URL and returns the extracted
- * text content.
- *
- * @param url the URL of the resource to be parsed
- * @return extracted text content
- * @throws IOException if the resource can not be read or parsed
- */
- public Reader parse(URL url) throws IOException {
- Metadata metadata = new Metadata();
- InputStream stream = TikaInputStream.get(url, metadata);
- return parse(stream, metadata);
- }
-
- /**
- * Parses the given document and returns the extracted text content.
- * The given input stream is closed by this method.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
- *
- * @param stream the document to be parsed
- * @param metadata document metadata
- * @return extracted text content
- * @throws IOException if the document can not be read
- * @throws TikaException if the document can not be parsed
- */
- public String parseToString(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- WriteOutContentHandler handler =
- new WriteOutContentHandler(maxStringLength);
- try {
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
- parser.parse(
- stream, new BodyContentHandler(handler), metadata, context);
- } catch (SAXException e) {
- if (!handler.isWriteLimitReached(e)) {
- // This should never happen with BodyContentHandler...
- throw new TikaException("Unexpected SAX processing failure", e);
- }
- } finally {
- stream.close();
- }
- return handler.toString();
- }
-
- /**
- * Parses the given document and returns the extracted text content.
- * The given input stream is closed by this method.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
- *
- * @param stream the document to be parsed
- * @return extracted text content
- * @throws IOException if the document can not be read
- * @throws TikaException if the document can not be parsed
- */
- public String parseToString(InputStream stream)
- throws IOException, TikaException {
- return parseToString(stream, new Metadata());
- }
-
- /**
- * Parses the given file and returns the extracted text content.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
- *
- * @param file the file to be parsed
- * @return extracted text content
- * @throws IOException if the file can not be read
- * @throws TikaException if the file can not be parsed
- */
- public String parseToString(File file) throws IOException, TikaException {
- return parseToString(file.toURI().toURL());
- }
-
- /**
- * Parses the resource at the given URL and returns the extracted
- * text content.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
- *
- * @param url the URL of the resource to be parsed
- * @return extracted text content
- * @throws IOException if the resource can not be read
- * @throws TikaException if the resource can not be parsed
- */
- public String parseToString(URL url) throws IOException, TikaException {
- Metadata metadata = new Metadata();
- InputStream stream = TikaInputStream.get(url, metadata);
- return parseToString(stream, metadata);
- }
-
- /**
- * Returns the maximum length of strings returned by the
- * parseToString methods.
- *
- * @since Apache Tika 0.7
- * @return maximum string length, or -1 if the limit has been disabled
- */
- public int getMaxStringLength() {
- return maxStringLength;
- }
-
- /**
- * Sets the maximum length of strings returned by the parseToString
- * methods.
- *
- * @since Apache Tika 0.7
- * @param maxStringLength maximum string length,
- * or -1 to disable this limit
- */
- public void setMaxStringLength(int maxStringLength) {
- this.maxStringLength = maxStringLength;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.net.URL;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Facade class for accessing Tika functionality. This class hides much of
+ * the underlying complexity of the lower level Tika classes and provides
+ * simple methods for many common parsing and type detection operations.
+ *
+ * @since Apache Tika 0.5
+ * @see Parser
+ * @see Detector
+ */
+public class Tika {
+
+ /**
+ * The detector instance used by this facade.
+ */
+ private final Detector detector;
+
+ /**
+ * The parser instance used by this facade.
+ */
+ private final Parser parser;
+
+ /**
+ * Maximum length of the strings returned by the parseToString methods.
+ * Used to prevent out of memory problems with huge input documents.
+ * The default setting is 100k characters.
+ */
+ private int maxStringLength = 100 * 1000;
+
+ /**
+ * Creates a Tika facade using the given detector and parser instances.
+ *
+ * @since Apache Tika 0.8
+ * @param detector type detector
+ * @param parser document parser
+ */
+ public Tika(Detector detector, Parser parser) {
+ this.detector = detector;
+ this.parser = parser;
+ }
+
+ /**
+ * Creates a Tika facade using the given configuration.
+ *
+ * @param config Tika configuration
+ */
+ public Tika(TikaConfig config) {
+ this(config.getMimeRepository(), new AutoDetectParser(config));
+ }
+
+ /**
+ * Creates a Tika facade using the default configuration.
+ */
+ public Tika() {
+ this(TikaConfig.getDefaultConfig());
+ }
+
+ /**
+ * Creates a Tika facade using the given detector instance and the
+ * default parser configuration.
+ *
+ * @since Apache Tika 0.8
+ * @param detector type detector
+ */
+ public Tika(Detector detector) {
+ this(detector, new AutoDetectParser(detector));
+ }
+
+
+ /**
+ * Detects the media type of the given document. The type detection is
+ * based on the content of the given document stream and any given
+ * document metadata. The document stream can be <code>null</code>,
+ * in which case only the given document metadata is used for type
+ * detection.
+ * <p>
+ * If the document stream supports the
+ * {@link InputStream#markSupported() mark feature}, then the stream is
+ * marked and reset to the original position before this method returns.
+ * Only a limited number of bytes are read from the stream.
+ * <p>
+ * If the document stream does not support the mark feature, it is
+ * wrapped in a {@link BufferedInputStream} for the duration of the
+ * detection. In that case the given stream itself is not reset, so the
+ * bytes read ahead by the wrapper may no longer be available from it
+ * when this method returns.
+ * <p>
+ * The given document stream is <em>not</em> closed by this method.
+ * <p>
+ * Unlike in the {@link #parse(InputStream, Metadata)} method, the
+ * given document metadata is <em>not</em> modified by this method.
+ *
+ * @param stream the document stream, or <code>null</code>
+ * @param metadata document metadata
+ * @return detected media type, in its string form
+ * @throws IOException if the stream can not be read
+ */
+ public String detect(InputStream stream, Metadata metadata)
+ throws IOException {
+ if (stream == null || stream.markSupported()) {
+ return detector.detect(stream, metadata).toString();
+ } else {
+ return detector.detect(
+ new BufferedInputStream(stream), metadata).toString();
+ }
+ }
+
+ /**
+ * Detects the media type of the given document. The type detection is
+ * based on the content of the given document stream.
+ * <p>
+ * If the document stream supports the
+ * {@link InputStream#markSupported() mark feature}, then the stream is
+ * marked and reset to the original position before this method returns.
+ * Only a limited number of bytes are read from the stream.
+ * <p>
+ * The given document stream is <em>not</em> closed by this method.
+ *
+ * @param stream the document stream
+ * @return detected media type
+ * @throws IOException if the stream can not be read
+ */
+ public String detect(InputStream stream) throws IOException {
+ return detect(stream, new Metadata());
+ }
+
+ /**
+ * Detects the media type of the given file. The type detection is
+ * based on the document content and a potential known file extension.
+ * <p>
+ * Use the {@link #detect(String)} method when you want to detect the
+ * type of the document without actually accessing the file.
+ *
+ * @param file the file
+ * @return detected media type
+ * @throws IOException if the file can not be read
+ */
+ public String detect(File file) throws IOException {
+ return detect(file.toURI().toURL());
+ }
+
+ /**
+ * Detects the media type of the resource at the given URL. The type
+ * detection is based on the document content and a potential known
+ * file extension included in the URL.
+ * <p>
+ * Use the {@link #detect(String)} method when you want to detect the
+ * type of the document without actually accessing the URL.
+ *
+ * @param url the URL of the resource
+ * @return detected media type
+ * @throws IOException if the resource can not be read
+ */
+ public String detect(URL url) throws IOException {
+ Metadata metadata = new Metadata();
+ InputStream stream = TikaInputStream.get(url, metadata);
+ try {
+ return detect(stream, metadata);
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Detects the media type of a document with the given file name.
+ * The type detection is based on known file name extensions.
+ * <p>
+ * The given name can also be a URL or a full file path. In such cases
+ * only the file name part of the string is used for type detection.
+ *
+ * @param name the file name of the document
+ * @return detected media type
+ */
+ public String detect(String name) {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+ try {
+ return detect(null, metadata);
+ } catch (IOException e) {
+ throw new IllegalStateException("Unexpected IOException", e);
+ }
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ * Input metadata like a file name or a content type hint can be passed
+ * in the given metadata instance. Metadata information extracted from
+ * the document is returned in that same metadata instance.
+ * <p>
+ * The parser used by this facade is also registered in the parse
+ * context under {@link Parser}.
+ *
+ * @param stream the document to be parsed
+ * @param metadata document metadata; carries the input hints and
+ * receives the metadata extracted from the document
+ * @return reader over the extracted text content
+ * @throws IOException if the document can not be read or parsed
+ */
+ public Reader parse(InputStream stream, Metadata metadata)
+ throws IOException {
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ return new ParsingReader(parser, stream, metadata, context);
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ *
+ * @param stream the document to be parsed
+ * @return extracted text content
+ * @throws IOException if the document can not be read or parsed
+ */
+ public Reader parse(InputStream stream) throws IOException {
+ return parse(stream, new Metadata());
+ }
+
+ /**
+ * Parses the given file and returns the extracted text content.
+ *
+ * @param file the file to be parsed
+ * @return extracted text content
+ * @throws IOException if the file can not be read or parsed
+ */
+ public Reader parse(File file) throws IOException {
+ return parse(file.toURI().toURL());
+ }
+
+ /**
+ * Parses the resource at the given URL and returns the extracted
+ * text content.
+ * <p>
+ * The stream opened for the URL is passed on to the returned reader.
+ * If the reader can not be created, the stream is closed before the
+ * failure is propagated, so that the opened resource is not leaked.
+ *
+ * @param url the URL of the resource to be parsed
+ * @return extracted text content
+ * @throws IOException if the resource can not be read or parsed
+ */
+ public Reader parse(URL url) throws IOException {
+ Metadata metadata = new Metadata();
+ InputStream stream = TikaInputStream.get(url, metadata);
+ Reader reader = null;
+ try {
+ reader = parse(stream, metadata);
+ return reader;
+ } finally {
+ if (reader == null) {
+ // parse() failed, so no reader has taken over the stream
+ stream.close();
+ }
+ }
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ * The given input stream is closed by this method, also when parsing
+ * fails.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
+ * <p>
+ * Reaching that limit is not reported as an error: the text collected
+ * up to the limit is returned and no exception is thrown.
+ *
+ * @param stream the document to be parsed
+ * @param metadata document metadata
+ * @return extracted text content
+ * @throws IOException if the document can not be read
+ * @throws TikaException if the document can not be parsed, or if SAX
+ * processing fails for a reason other than the length limit
+ */
+ public String parseToString(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ WriteOutContentHandler handler =
+ new WriteOutContentHandler(maxStringLength);
+ try {
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ parser.parse(
+ stream, new BodyContentHandler(handler), metadata, context);
+ } catch (SAXException e) {
+ if (!handler.isWriteLimitReached(e)) {
+ // This should never happen with BodyContentHandler...
+ throw new TikaException("Unexpected SAX processing failure", e);
+ }
+ } finally {
+ stream.close();
+ }
+ return handler.toString();
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ * The given input stream is closed by this method.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
+ *
+ * @param stream the document to be parsed
+ * @return extracted text content
+ * @throws IOException if the document can not be read
+ * @throws TikaException if the document can not be parsed
+ */
+ public String parseToString(InputStream stream)
+ throws IOException, TikaException {
+ return parseToString(stream, new Metadata());
+ }
+
+ /**
+ * Parses the given file and returns the extracted text content.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
+ *
+ * @param file the file to be parsed
+ * @return extracted text content
+ * @throws IOException if the file can not be read
+ * @throws TikaException if the file can not be parsed
+ */
+ public String parseToString(File file) throws IOException, TikaException {
+ return parseToString(file.toURI().toURL());
+ }
+
+ /**
+ * Parses the resource at the given URL and returns the extracted
+ * text content.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
+ *
+ * @param url the URL of the resource to be parsed
+ * @return extracted text content
+ * @throws IOException if the resource can not be read
+ * @throws TikaException if the resource can not be parsed
+ */
+ public String parseToString(URL url) throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ InputStream stream = TikaInputStream.get(url, metadata);
+ return parseToString(stream, metadata);
+ }
+
+ /**
+ * Returns the maximum length of strings returned by the
+ * parseToString methods.
+ *
+ * @since Apache Tika 0.7
+ * @return maximum string length, or -1 if the limit has been disabled
+ */
+ public int getMaxStringLength() {
+ return maxStringLength;
+ }
+
+ /**
+ * Sets the maximum length of strings returned by the parseToString
+ * methods.
+ *
+ * @since Apache Tika 0.7
+ * @param maxStringLength maximum string length,
+ * or -1 to disable this limit
+ */
+ public void setMaxStringLength(int maxStringLength) {
+ this.maxStringLength = maxStringLength;
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java Thu Sep 2 14:46:46 2010
@@ -1,68 +1,68 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
-
-/**
- * Content type detector that combines multiple different detection mechanisms.
- */
-public class CompositeDetector implements Detector {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = 5980683158436430252L;
-
- private final MediaTypeRegistry registry;
-
- private final List<Detector> detectors;
-
- public CompositeDetector(
- MediaTypeRegistry registry, List<Detector> detectors) {
- this.registry = registry;
- this.detectors = detectors;
- }
-
- public CompositeDetector(List<Detector> detectors) {
- this(new MediaTypeRegistry(), detectors);
- }
-
- public CompositeDetector(Detector... detectors) {
- this(Arrays.asList(detectors));
- }
-
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- MediaType type = MediaType.OCTET_STREAM;
- for (Detector detector : detectors) {
- MediaType detected = detector.detect(input, metadata);
- if (registry.isSpecializationOf(detected, type)) {
- type = detected;
- }
- }
- return type;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+
+/**
+ * Content type detector that combines multiple different detection mechanisms.
+ * <p>
+ * The configured detectors are consulted in list order, each with the
+ * same input stream and metadata. The result starts out as
+ * {@link MediaType#OCTET_STREAM} and is replaced by the answer of a
+ * detector whenever the media type registry reports that answer to be
+ * a specialization of the current result.
+ * <p>
+ * The detector list given to the constructor is stored as is, without
+ * copying. When no registry is given, a new empty
+ * {@link MediaTypeRegistry} is used.
+ */
+public class CompositeDetector implements Detector {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 5980683158436430252L;
+
+ private final MediaTypeRegistry registry;
+
+ private final List<Detector> detectors;
+
+ public CompositeDetector(
+ MediaTypeRegistry registry, List<Detector> detectors) {
+ this.registry = registry;
+ this.detectors = detectors;
+ }
+
+ public CompositeDetector(List<Detector> detectors) {
+ this(new MediaTypeRegistry(), detectors);
+ }
+
+ public CompositeDetector(Detector... detectors) {
+ this(Arrays.asList(detectors));
+ }
+
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ MediaType type = MediaType.OCTET_STREAM;
+ for (Detector detector : detectors) {
+ MediaType detected = detector.detect(input, metadata);
+ if (registry.isSpecializationOf(detected, type)) {
+ type = detected;
+ }
+ }
+ return type;
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java Thu Sep 2 14:46:46 2010
@@ -1,59 +1,59 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detector. Implementations of this interface use various
- * heuristics to detect the content type of a document based on given
- * input metadata or the first few bytes of the document stream.
- *
- * @since Apache Tika 0.3
- */
-public interface Detector extends Serializable {
-
- /**
- * Detects the content type of the given input document. Returns
- * <code>application/octet-stream</code> if the type of the document
- * can not be detected.
- * <p>
- * If the document input stream is not available, then the first
- * argument may be <code>null</code>. Otherwise the detector may
- * read bytes from the start of the stream to help in type detection.
- * The given stream is guaranteed to support the
- * {@link InputStream#markSupported() mark feature} and the detector
- * is expected to {@link InputStream#mark(int) mark} the stream before
- * reading any bytes from it, and to {@link InputStream#reset() reset}
- * the stream before returning. The stream must not be closed by the
- * detector.
- * <p>
- * The given input metadata is only read, not modified, by the detector.
- *
- * @param input document input stream, or <code>null</code>
- * @param metadata input metadata for the document
- * @return detected media type, or <code>application/octet-stream</code>
- * @throws IOException if the document input stream could not be read
- */
- MediaType detect(InputStream input, Metadata metadata) throws IOException;
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detector. Implementations of this interface use various
+ * heuristics to detect the content type of a document based on given
+ * input metadata or the first few bytes of the document stream.
+ * <p>
+ * The interface extends {@link Serializable}, so every implementation
+ * is a serializable type.
+ *
+ * @since Apache Tika 0.3
+ */
+public interface Detector extends Serializable {
+
+ /**
+ * Detects the content type of the given input document. Returns
+ * <code>application/octet-stream</code> if the type of the document
+ * cannot be detected.
+ * <p>
+ * If the document input stream is not available, then the first
+ * argument may be <code>null</code>. Otherwise the detector may
+ * read bytes from the start of the stream to help in type detection.
+ * The given stream is guaranteed to support the
+ * {@link InputStream#markSupported() mark feature} and the detector
+ * is expected to {@link InputStream#mark(int) mark} the stream before
+ * reading any bytes from it, and to {@link InputStream#reset() reset}
+ * the stream before returning, so that the caller sees the stream at
+ * the same position as before the call. The stream must not be closed
+ * by the detector.
+ * <p>
+ * The given input metadata is only read, not modified, by the detector.
+ *
+ * @param input document input stream, or <code>null</code>
+ * @param metadata input metadata for the document
+ * @return detected media type, or <code>application/octet-stream</code>
+ * @throws IOException if the document input stream could not be read
+ */
+ MediaType detect(InputStream input, Metadata metadata) throws IOException;
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/Detector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Thu Sep 2 14:46:46 2010
@@ -1,216 +1,216 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection based on magic bytes, i.e. type-specific patterns
- * near the beginning of the document input stream.
- *
- * @since Apache Tika 0.3
- */
-public class MagicDetector implements Detector {
-
- /**
- * The matching media type. Returned by the
- * {@link #detect(InputStream, Metadata)} method if a match is found.
- */
- private final MediaType type;
-
- /**
- * Length of the comparison window. All the byte arrays here are this long.
- */
- private final int length;
-
- /**
- * The magic match pattern. If this byte pattern is equal to the
- * possibly bit-masked bytes from the input stream, then the type
- * detection succeeds and the configured {@link #type} is returned.
- */
- private final byte[] pattern;
-
- /**
- * Bit mask that is applied to the source bytes before pattern matching.
- */
- private final byte[] mask;
-
- /**
- * First offset (inclusive) of the comparison window within the
- * document input stream. Greater than or equal to zero.
- */
- private final int offsetRangeBegin;
-
- /**
- * Last offset (inclusive) of the comparison window within the document
- * input stream. Greater than or equal to the
- * {@link #offsetRangeBegin first offset}.
- * <p>
- * Note that this is <em>not</em> the offset of the last byte read from
- * the document stream. Instead, the last window of bytes to be compared
- * starts at this offset.
- */
- private final int offsetRangeEnd;
-
- private final String asString;
-
- /**
- * Creates a detector for input documents that have the exact given byte
- * pattern at the beginning of the document stream.
- *
- * @param type matching media type
- * @param pattern magic match pattern
- */
- public MagicDetector(MediaType type, byte[] pattern) {
- this(type, pattern, 0);
- }
-
- /**
- * Creates a detector for input documents that have the exact given byte
- * pattern at the given offset of the document stream.
- *
- * @param type matching media type
- * @param pattern magic match pattern
- * @param offset offset of the pattern match
- */
- public MagicDetector(MediaType type, byte[] pattern, int offset) {
- this(type, pattern, null, offset, offset);
- }
-
- /**
- * Creates a detector for input documents that meet the specified
- * magic match.
- */
- public MagicDetector(
- MediaType type, byte[] pattern, byte[] mask,
- int offsetRangeBegin, int offsetRangeEnd) {
- if (type == null) {
- throw new IllegalArgumentException("Matching media type is null");
- } else if (pattern == null) {
- throw new IllegalArgumentException("Magic match pattern is null");
- } else if (offsetRangeBegin < 0
- || offsetRangeEnd < offsetRangeBegin) {
- throw new IllegalArgumentException(
- "Invalid offset range: ["
- + offsetRangeBegin + "," + offsetRangeEnd + "]");
- }
-
- this.type = type;
-
- this.length = Math.max(pattern.length, mask != null ? mask.length : 0);
-
- this.mask = new byte[length];
- this.pattern = new byte[length];
-
- for (int i = 0; i < length; i++) {
- if (mask != null && i < mask.length) {
- this.mask[i] = mask[i];
- } else {
- this.mask[i] = -1;
- }
-
- if (i < pattern.length) {
- this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
- } else {
- this.pattern[i] = 0;
- }
- }
-
- this.offsetRangeBegin = offsetRangeBegin;
- this.offsetRangeEnd = offsetRangeEnd;
-
- // Build the string representation. Needs to be unique, as
- // these get compared. Compute now as may get compared a lot!
- this.asString = "Magic Detection for " + type.toString() +
- " looking for " + pattern.length +
- " bytes = " + this.pattern +
- " mask = " + this.mask;
- }
-
- /**
- *
- * @param input document input stream, or <code>null</code>
- * @param metadata ignored
- */
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- if (input == null) {
- return MediaType.OCTET_STREAM;
- }
-
- input.mark(offsetRangeEnd + length);
- try {
- int offset = 0;
-
- // Skip bytes at the beginning, using skip() or read()
- while (offset < offsetRangeBegin) {
- long n = input.skip(offsetRangeBegin - offset);
- if (n > 0) {
- offset += n;
- } else if (input.read() != -1) {
- offset += 1;
- } else {
- return MediaType.OCTET_STREAM;
- }
- }
-
- // Fill in the comparison window
- byte[] buffer =
- new byte[length + (offsetRangeEnd - offsetRangeBegin)];
- int n = input.read(buffer);
- if (n > 0) {
- offset += n;
- }
- while (n != -1 && offset < offsetRangeEnd + length) {
- int bufferOffset = offset - offsetRangeBegin;
- n = input.read(
- buffer, bufferOffset, buffer.length - bufferOffset);
- }
- if (offset < offsetRangeBegin + length) {
- return MediaType.OCTET_STREAM;
- }
-
- // Loop until we've covered the entire offset range
- for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
- boolean match = true;
- for (int j = 0; match && j < length; j++) {
- match = (buffer[i + j] & mask[j]) == pattern[j];
- }
- if (match) {
- return type;
- }
- }
-
- return MediaType.OCTET_STREAM;
- } finally {
- input.reset();
- }
- }
-
- /**
- * Returns a string representation of the Detection Rule.
- * Should sort nicely by type and details, as we sometimes
- * compare these.
- */
- public String toString() {
- return asString;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on magic bytes, i.e. type-specific patterns
+ * near the beginning of the document input stream.
+ *
+ * @since Apache Tika 0.3
+ */
+public class MagicDetector implements Detector {
+
+    /**
+     * The matching media type. Returned by the
+     * {@link #detect(InputStream, Metadata)} method if a match is found.
+     */
+    private final MediaType type;
+
+    /**
+     * Length of the comparison window. All the byte arrays here are this long.
+     */
+    private final int length;
+
+    /**
+     * The magic match pattern. If this byte pattern is equal to the
+     * possibly bit-masked bytes from the input stream, then the type
+     * detection succeeds and the configured {@link #type} is returned.
+     */
+    private final byte[] pattern;
+
+    /**
+     * Bit mask that is applied to the source bytes before pattern matching.
+     */
+    private final byte[] mask;
+
+    /**
+     * First offset (inclusive) of the comparison window within the
+     * document input stream. Greater than or equal to zero.
+     */
+    private final int offsetRangeBegin;
+
+    /**
+     * Last offset (inclusive) of the comparison window within the document
+     * input stream. Greater than or equal to the
+     * {@link #offsetRangeBegin first offset}.
+     * <p>
+     * Note that this is <em>not</em> the offset of the last byte read from
+     * the document stream. Instead, the last window of bytes to be compared
+     * starts at this offset.
+     */
+    private final int offsetRangeEnd;
+
+    /**
+     * Precomputed string representation, returned by {@link #toString()}.
+     */
+    private final String asString;
+
+    /**
+     * Creates a detector for input documents that have the exact given byte
+     * pattern at the beginning of the document stream.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     */
+    public MagicDetector(MediaType type, byte[] pattern) {
+        this(type, pattern, 0);
+    }
+
+    /**
+     * Creates a detector for input documents that have the exact given byte
+     * pattern at the given offset of the document stream.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     * @param offset offset of the pattern match
+     */
+    public MagicDetector(MediaType type, byte[] pattern, int offset) {
+        this(type, pattern, null, offset, offset);
+    }
+
+    /**
+     * Creates a detector for input documents that meet the specified
+     * magic match.
+     * <p>
+     * The comparison window is as long as the longer of the pattern and
+     * the mask. A missing or shorter mask is padded with all-ones bytes,
+     * a shorter pattern is padded with zero bytes, and the stored pattern
+     * is pre-masked so that matching only needs to mask the input bytes.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     * @param mask bit mask applied to the input bytes, or <code>null</code>
+     * @param offsetRangeBegin first offset (inclusive) at which a match
+     *                         may start
+     * @param offsetRangeEnd last offset (inclusive) at which a match
+     *                       may start
+     * @throws IllegalArgumentException if the type or the pattern is
+     *         <code>null</code>, or if the offset range is invalid
+     */
+    public MagicDetector(
+            MediaType type, byte[] pattern, byte[] mask,
+            int offsetRangeBegin, int offsetRangeEnd) {
+        if (type == null) {
+            throw new IllegalArgumentException("Matching media type is null");
+        } else if (pattern == null) {
+            throw new IllegalArgumentException("Magic match pattern is null");
+        } else if (offsetRangeBegin < 0
+                || offsetRangeEnd < offsetRangeBegin) {
+            throw new IllegalArgumentException(
+                    "Invalid offset range: ["
+                    + offsetRangeBegin + "," + offsetRangeEnd + "]");
+        }
+
+        this.type = type;
+
+        this.length = Math.max(pattern.length, mask != null ? mask.length : 0);
+
+        this.mask = new byte[length];
+        this.pattern = new byte[length];
+
+        for (int i = 0; i < length; i++) {
+            if (mask != null && i < mask.length) {
+                this.mask[i] = mask[i];
+            } else {
+                this.mask[i] = -1;
+            }
+
+            if (i < pattern.length) {
+                this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
+            } else {
+                this.pattern[i] = 0;
+            }
+        }
+
+        this.offsetRangeBegin = offsetRangeBegin;
+        this.offsetRangeEnd = offsetRangeEnd;
+
+        // Build the string representation once, as it may get compared a
+        // lot. The array contents are printed (concatenating a byte[]
+        // directly would only print its identity hash), and the offset
+        // range is included so that detectors differing in any setting
+        // still get different strings.
+        this.asString = "Magic Detection for " + type.toString()
+                + " looking for " + pattern.length
+                + " bytes = " + java.util.Arrays.toString(this.pattern)
+                + " mask = " + java.util.Arrays.toString(this.mask)
+                + " at offsets [" + offsetRangeBegin
+                + "," + offsetRangeEnd + "]";
+    }
+
+    /**
+     * Checks whether the masked pattern occurs in the given stream at any
+     * start offset within the configured offset range. The stream is
+     * marked before reading and reset before returning.
+     *
+     * @param input document input stream, or <code>null</code>
+     * @param metadata ignored
+     * @return the configured media type if the pattern matches,
+     *         <code>application/octet-stream</code> otherwise
+     * @throws IOException if the document input stream could not be read
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        input.mark(offsetRangeEnd + length);
+        try {
+            int offset = 0;
+
+            // Skip bytes at the beginning, using skip() or read()
+            while (offset < offsetRangeBegin) {
+                long n = input.skip(offsetRangeBegin - offset);
+                if (n > 0) {
+                    offset += n;
+                } else if (input.read() != -1) {
+                    offset += 1;
+                } else {
+                    return MediaType.OCTET_STREAM;
+                }
+            }
+
+            // Fill in the comparison window. A single read() may return
+            // fewer bytes than requested, so keep reading and advance the
+            // fill position until the buffer is full or the stream ends.
+            byte[] buffer =
+                new byte[length + (offsetRangeEnd - offsetRangeBegin)];
+            int filled = 0;
+            while (filled < buffer.length) {
+                int n = input.read(buffer, filled, buffer.length - filled);
+                if (n <= 0) {
+                    // End of stream (or a stream that makes no progress)
+                    break;
+                }
+                filled += n;
+            }
+            if (filled < length) {
+                // Not even one complete window is available
+                return MediaType.OCTET_STREAM;
+            }
+
+            // Try every start position in the offset range, but only
+            // windows that lie entirely within the bytes actually read;
+            // the unread tail of the buffer is just zero padding.
+            int lastStart = Math.min(
+                    offsetRangeEnd - offsetRangeBegin, filled - length);
+            for (int i = 0; i <= lastStart; i++) {
+                boolean match = true;
+                for (int j = 0; match && j < length; j++) {
+                    match = (buffer[i + j] & mask[j]) == pattern[j];
+                }
+                if (match) {
+                    return type;
+                }
+            }
+
+            return MediaType.OCTET_STREAM;
+        } finally {
+            input.reset();
+        }
+    }
+
+    /**
+     * Returns a string representation of the Detection Rule.
+     * Should sort nicely by type and details, as we sometimes
+     * compare these.
+     */
+    @Override
+    public String toString() {
+        return asString;
+    }
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java Thu Sep 2 14:46:46 2010
@@ -1,143 +1,143 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection based on the resource name. An instance of this
- * class contains a set of regular expression patterns that are matched
- * against the resource name potentially given as a part of the input metadata.
- * <p>
- * If a pattern matches the given name, then the media type associated with
- * that pattern is returned as the likely content type of the input document.
- * Otherwise the returned type is <code>application/octet-stream</code>.
- * <p>
- * See the {@link #detect(InputStream, Metadata)} method for more details
- * of the matching algorithm.
- *
- * @since Apache Tika 0.3
- */
-public class NameDetector implements Detector {
-
- /**
- * The regular expression patterns used for type detection.
- */
- private final Map<Pattern, MediaType> patterns;
-
- /**
- * Creates a new content type detector based on the given name patterns.
- * The given pattern map is not copied, so the caller may update the
- * mappings even after this detector instance has been created. However,
- * the map <em>must not be concurrently modified</em> while this instance
- * is used for type detection.
- *
- * @param patterns map from name patterns to corresponding media types
- */
- public NameDetector(Map<Pattern, MediaType> patterns) {
- this.patterns = patterns;
- }
-
- /**
- * Detects the content type of an input document based on the document
- * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
- * the given input metadata is expected to contain the name (normally
- * a file name or a URL) of the input document.
- * <p>
- * If a resource name is given, then it is first processed as follows.
- * <ol>
- * <li>
- * Potential URL query (?...) and fragment identifier (#...)
- * parts are removed from the end of the resource name.
- * </li>
- * <li>
- * Potential leading path elements (up to the last slash or backslash)
- * are removed from the beginning of the resource name.
- * </li>
- * <li>
- * Potential URL encodings (%nn, in UTF-8) are decoded.
- * </li>
- * <li>
- * Any leading and trailing whitespace is removed.
- * </li>
- * </ol>
- * <p>
- * The resulting name string (if any) is then matched in sequence against
- * all the configured name patterns. If a match is found, then the (first)
- * matching media type is returned.
- *
- * @param input ignored
- * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
- * @return detected media type, or <code>application/octet-stream</code>
- */
- public MediaType detect(InputStream input, Metadata metadata) {
- // Look for a resource name in the input metadata
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name != null) {
- // If the name is a URL, skip the trailing query and fragment parts
- int question = name.indexOf('?');
- if (question != -1) {
- name = name.substring(0, question);
- }
- int hash = name.indexOf('#');
- if (hash != -1) {
- name = name.substring(0, hash);
- }
-
- // If the name is a URL or a path, skip all but the last component
- int slash = name.lastIndexOf('/');
- if (slash != -1) {
- name = name.substring(slash + 1);
- }
- int backslash = name.lastIndexOf('\\');
- if (backslash != -1) {
- name = name.substring(backslash + 1);
- }
-
- // Decode any potential URL encoding
- int percent = name.indexOf('%');
- if (percent != -1) {
- try {
- name = URLDecoder.decode(name, "UTF-8");
- } catch (UnsupportedEncodingException e) {
- throw new IllegalStateException("UTF-8 not supported", e);
- }
- }
-
- // Skip any leading or trailing whitespace
- name = name.trim();
- if (name.length() > 0) {
- // Match the name against the registered patterns
- for (Pattern pattern : patterns.keySet()) {
- if (pattern.matcher(name).matches()) {
- return patterns.get(pattern);
- }
- }
- }
- }
-
- return MediaType.OCTET_STREAM;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on the resource name. An instance of this
+ * class contains a set of regular expression patterns that are matched
+ * against the resource name potentially given as a part of the input metadata.
+ * <p>
+ * If a pattern matches the given name, then the media type associated with
+ * that pattern is returned as the likely content type of the input document.
+ * Otherwise the returned type is <code>application/octet-stream</code>.
+ * <p>
+ * See the {@link #detect(InputStream, Metadata)} method for more details
+ * of the matching algorithm.
+ *
+ * @since Apache Tika 0.3
+ */
+public class NameDetector implements Detector {
+
+    /**
+     * The regular expression patterns used for type detection.
+     */
+    private final Map<Pattern, MediaType> patterns;
+
+    /**
+     * Creates a new content type detector based on the given name patterns.
+     * The given pattern map is not copied, so the caller may update the
+     * mappings even after this detector instance has been created. However,
+     * the map <em>must not be concurrently modified</em> while this instance
+     * is used for type detection.
+     *
+     * @param patterns map from name patterns to corresponding media types
+     */
+    public NameDetector(Map<Pattern, MediaType> patterns) {
+        this.patterns = patterns;
+    }
+
+    /**
+     * Detects the content type of an input document based on the document
+     * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
+     * the given input metadata is expected to contain the name (normally
+     * a file name or a URL) of the input document.
+     * <p>
+     * If a resource name is given, then it is first processed as follows.
+     * <ol>
+     *   <li>
+     *     Potential URL query (?...) and fragment identifier (#...)
+     *     parts are removed from the end of the resource name.
+     *   </li>
+     *   <li>
+     *     Potential leading path elements (up to the last slash or backslash)
+     *     are removed from the beginning of the resource name.
+     *   </li>
+     *   <li>
+     *     Potential URL encodings (%nn, in UTF-8) are decoded. A name
+     *     that contains a percent sign but is not validly URL encoded
+     *     (for example <code>100%.txt</code>) is used undecoded.
+     *   </li>
+     *   <li>
+     *     Any leading and trailing whitespace is removed.
+     *   </li>
+     * </ol>
+     * <p>
+     * The resulting name string (if any) is then matched in sequence against
+     * all the configured name patterns. If a match is found, then the (first)
+     * matching media type is returned.
+     *
+     * @param input ignored
+     * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
+     * @return detected media type, or <code>application/octet-stream</code>
+     */
+    public MediaType detect(InputStream input, Metadata metadata) {
+        // Look for a resource name in the input metadata
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null) {
+            // If the name is a URL, skip the trailing query and fragment parts
+            int question = name.indexOf('?');
+            if (question != -1) {
+                name = name.substring(0, question);
+            }
+            int hash = name.indexOf('#');
+            if (hash != -1) {
+                name = name.substring(0, hash);
+            }
+
+            // If the name is a URL or a path, skip all but the last component
+            int slash = name.lastIndexOf('/');
+            if (slash != -1) {
+                name = name.substring(slash + 1);
+            }
+            int backslash = name.lastIndexOf('\\');
+            if (backslash != -1) {
+                name = name.substring(backslash + 1);
+            }
+
+            // Decode any potential URL encoding
+            int percent = name.indexOf('%');
+            if (percent != -1) {
+                try {
+                    name = URLDecoder.decode(name, "UTF-8");
+                } catch (UnsupportedEncodingException e) {
+                    throw new IllegalStateException("UTF-8 not supported", e);
+                } catch (IllegalArgumentException ignored) {
+                    // The percent sign is not part of a valid escape
+                    // sequence, so this is a plain file name rather than
+                    // a URL encoded one. Keep the name unchanged.
+                }
+            }
+
+            // Skip any leading or trailing whitespace
+            name = name.trim();
+            if (name.length() > 0) {
+                // Match the name against the registered patterns; iterate
+                // over the entries to avoid a second lookup per match
+                for (Map.Entry<Pattern, MediaType> entry
+                        : patterns.entrySet()) {
+                    if (entry.getKey().matcher(name).matches()) {
+                        return entry.getValue();
+                    }
+                }
+            }
+        }
+
+        return MediaType.OCTET_STREAM;
+    }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Thu Sep 2 14:46:46 2010
@@ -1,115 +1,115 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection of plain text documents. This detector looks at the
- * beginning of the document input stream and considers the document to be
- * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
- * found.
- * <p>
- * Note that text documents with a character encoding like UTF-16 are better
- * detected with {@link MagicDetector} and an appropriate magic byte pattern.
- *
- * @since Apache Tika 0.3
- */
-public class TextDetector implements Detector {
-
- /**
- * The number of bytes from the beginning of the document stream
- * to test for control bytes.
- */
- private static final int NUMBER_OF_BYTES_TO_TEST = 512;
-
- /**
- * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
- * in the range below 0x20 (the space character). If an entry in this
- * table is <code>true</code> then that byte is very unlikely to occur
- * in a plain text document.
- * <p>
- * The contents of this lookup table are based on the following definition
- * from section 4 of the "Content-Type Processing Model" Internet-draft
- * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
- * >draft-abarth-mime-sniff-01</a>).
- * <pre>
- * +-------------------------+
- * | Binary data byte ranges |
- * +-------------------------+
- * | 0x00 -- 0x08 |
- * | 0x0B |
- * | 0x0E -- 0x1A |
- * | 0x1C -- 0x1F |
- * +-------------------------+
- * </pre>
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
- */
- private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
-
- static {
- Arrays.fill(IS_CONTROL_BYTE, true);
- IS_CONTROL_BYTE[0x09] = false; // tabulator
- IS_CONTROL_BYTE[0x0A] = false; // new line
- IS_CONTROL_BYTE[0x0C] = false; // new page
- IS_CONTROL_BYTE[0x0D] = false; // carriage return
- IS_CONTROL_BYTE[0x1B] = false; // escape
- }
-
- /**
- * Looks at the beginning of the document input stream to determine
- * whether the document is text or not.
- *
- * @param input document input stream, or <code>null</code>
- * @param metadata ignored
- * @return "text/plain" if the input stream suggest a text document,
- * "application/octet-stream" otherwise
- */
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- if (input == null) {
- return MediaType.OCTET_STREAM;
- }
-
- input.mark(NUMBER_OF_BYTES_TO_TEST);
- try {
- for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
- int ch = input.read();
- if (ch == -1) {
- if (i > 0) {
- return MediaType.TEXT_PLAIN;
- } else {
- // See https://issues.apache.org/jira/browse/TIKA-483
- return MediaType.OCTET_STREAM;
- }
- } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
- return MediaType.OCTET_STREAM;
- }
- }
- return MediaType.TEXT_PLAIN;
- } finally {
- input.reset();
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector for plain text content. The first bytes of the document input
+ * stream are scanned, and the document is reported as text when none of
+ * them is an ASCII (ISO-Latin-1, UTF-8, etc.) control byte.
+ * <p>
+ * Text stored in an encoding like UTF-16 is better recognized with
+ * {@link MagicDetector} and a suitable magic byte pattern.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TextDetector implements Detector {
+
+    /**
+     * How many bytes from the start of the document stream are inspected
+     * for control bytes.
+     */
+    private static final int NUMBER_OF_BYTES_TO_TEST = 512;
+
+    /**
+     * Flags, indexed by byte value, for every byte below 0x20 (the space
+     * character). A <code>true</code> entry marks an
+     * ASCII/ISO-Latin/UTF-8/etc. control byte that is very unlikely to
+     * show up in a plain text document.
+     * <p>
+     * The table follows this definition from section 4 of the
+     * "Content-Type Processing Model" Internet-draft
+     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+     * >draft-abarth-mime-sniff-01</a>).
+     * <pre>
+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * </pre>
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
+     */
+    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
+
+    static {
+        // Start with everything flagged, then clear the bytes accepted in
+        // text: tabulator (0x09), new line (0x0A), new page (0x0C),
+        // carriage return (0x0D) and escape (0x1B).
+        Arrays.fill(IS_CONTROL_BYTE, true);
+        for (int textByte : new int[] { 0x09, 0x0A, 0x0C, 0x0D, 0x1B }) {
+            IS_CONTROL_BYTE[textByte] = false;
+        }
+    }
+
+    /**
+     * Inspects the start of the document input stream to decide whether
+     * the document looks like text. The stream is marked before reading
+     * and reset afterwards; no <code>markSupported()</code> check is made
+     * here.
+     *
+     * @param input document input stream, or <code>null</code>
+     * @param metadata ignored
+     * @return "text/plain" if the input stream suggests a text document,
+     *         "application/octet-stream" otherwise
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        input.mark(NUMBER_OF_BYTES_TO_TEST);
+        try {
+            int accepted = 0;
+            while (accepted < NUMBER_OF_BYTES_TO_TEST) {
+                int value = input.read();
+                if (value == -1) {
+                    break; // end of stream before the limit was reached
+                }
+                if (value < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[value]) {
+                    return MediaType.OCTET_STREAM;
+                }
+                accepted++;
+            }
+            // An empty stream is not considered text,
+            // see https://issues.apache.org/jira/browse/TIKA-483
+            return accepted > 0 ? MediaType.TEXT_PLAIN : MediaType.OCTET_STREAM;
+        } finally {
+            input.reset();
+        }
+    }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java Thu Sep 2 14:46:46 2010
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Content type detection based on a content type hint. This detector simply
- * trusts any valid content type hint given in the input metadata, and returns
- * that as the likely type of the input document.
- *
- * @since Apache Tika 0.3
- */
-public class TypeDetector implements Detector {
-
- /**
- * Detects the content type of an input document based on a type hint
- * given in the input metadata. The CONTENT_TYPE attribute of the given
- * input metadata is expected to contain the type of the input document.
- * If that attribute exists and contains a valid type name, then that
- * type is returned.
- *
- * @param input ignored
- * @param metadata input metadata, possibly with a CONTENT_TYPE value
- * @return detected media type, or <code>application/octet-stream</code>
- */
- public MediaType detect(InputStream input, Metadata metadata) {
- // Look for a type hint in the input metadata
- String hint = metadata.get(Metadata.CONTENT_TYPE);
- if (hint != null) {
- MediaType type = MediaType.parse(hint);
- if (type != null) {
- return type;
- }
- }
- return MediaType.OCTET_STREAM;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector that relies on a content type hint. Any valid content type
+ * hint found in the input metadata is trusted and returned as the likely
+ * type of the input document.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TypeDetector implements Detector {
+
+    /**
+     * Returns the media type named by the CONTENT_TYPE attribute of the
+     * given input metadata. When the attribute is missing, or its value
+     * does not parse to a media type, the generic
+     * <code>application/octet-stream</code> type is returned instead.
+     *
+     * @param input ignored
+     * @param metadata input metadata, possibly with a CONTENT_TYPE value
+     * @return detected media type, or <code>application/octet-stream</code>
+     */
+    public MediaType detect(InputStream input, Metadata metadata) {
+        String declared = metadata.get(Metadata.CONTENT_TYPE);
+        // Only attempt to parse when a hint is actually present
+        MediaType parsed = (declared == null) ? null : MediaType.parse(declared);
+        return (parsed == null) ? MediaType.OCTET_STREAM : parsed;
+    }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java Thu Sep 2 14:46:46 2010
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.metadata;
-
-/**
- * XMP Paged-text schema. This is a collection of
- * {@link Property property definition} constants for the paged text
- * properties defined in the XMP standard.
- *
- * @since Apache Tika 0.8
- * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
- * >XMP Specification, Part 2: Standard Schemas</a>
- */
-public interface PagedText {
-
- /**
- * "The number of pages in the document (including any in contained
- * documents)."
- */
- Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * XMP Paged-text schema. This is a collection of
+ * {@link Property property definition} constants for the paged text
+ * properties defined in the XMP standard.
+ * <p>
+ * The only constant declared in this interface is {@link #N_PAGES}.
+ *
+ * @since Apache Tika 0.8
+ * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
+ * >XMP Specification, Part 2: Standard Schemas</a>
+ */
+public interface PagedText {
+
+ /**
+ * "The number of pages in the document (including any in contained
+ * documents)."
+ * <p>
+ * Created with <code>Property.internalInteger</code> under the name
+ * <code>xmpTPg:NPages</code>. NOTE(review): the quoted description is
+ * presumably taken from the XMP specification referenced on this
+ * interface, and the factory name suggests an integer-valued internal
+ * property; confirm both against the specification and {@link Property}.
+ */
+ Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
------------------------------------------------------------------------------
svn:eol-style = native