You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/23 13:49:37 UTC

tika git commit: TIKA-2081 -- add fileUrl back into tika-server

Repository: tika
Updated Branches:
  refs/heads/master 3a5431e20 -> d612aea85


TIKA-2081 -- add fileUrl back into tika-server


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d612aea8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d612aea8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d612aea8

Branch: refs/heads/master
Commit: d612aea850060c7d77124f79c525f68032a11031
Parents: 3a5431e
Author: tballison <ta...@mitre.org>
Authored: Fri Sep 23 09:49:28 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Sep 23 09:49:28 2016 -0400

----------------------------------------------------------------------
 .../tika/server/DefaultInputStreamFactory.java  | 33 +++++++++++++
 .../apache/tika/server/InputStreamFactory.java  | 34 +++++++++++++
 .../org/apache/tika/server/TikaServerCli.java   | 25 +++++++++-
 .../server/URLEnabledInputStreamFactory.java    | 52 ++++++++++++++++++++
 .../tika/server/resource/DetectorResource.java  |  2 +-
 .../tika/server/resource/MetadataResource.java  |  9 ++--
 .../resource/RecursiveMetadataResource.java     |  6 +--
 .../tika/server/resource/TikaResource.java      | 20 ++++++--
 .../tika/server/resource/UnpackerResource.java  |  8 +--
 .../org/apache/tika/server/CXFTestBase.java     |  4 +-
 10 files changed, 174 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java b/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
new file mode 100644
index 0000000..a2df856
--- /dev/null
+++ b/tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Pass-through factory: returns the supplied InputStream as is and ignores the HTTP headers.
+ */
+public class DefaultInputStreamFactory implements InputStreamFactory {
+
+    @Override
+    public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws IOException {
+        return is;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java b/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
new file mode 100644
index 0000000..27e7f86
--- /dev/null
+++ b/tika-server/src/main/java/org/apache/tika/server/InputStreamFactory.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Interface to allow for custom/consistent creation of the InputStream handed to the server resources.
+ * <p>
+ * This factory is used statically in TikaResource (one shared instance), so make sure
+ * not to hold instance state in implementations.
+ */
+public interface InputStreamFactory {
+
+    public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws IOException; // NOTE: "Steam" (sic) is the actual method name; implementations and callers must use this spelling
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index 4804398..831a6d3 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -68,6 +68,13 @@ public class TikaServerCli {
             new HashSet<String>(Arrays.asList("debug", "info"));
     private static final Log logger = LogFactory.getLog(TikaServerCli.class);
 
+    private static final String FILE_URL_WARNING = // printed to stdout at startup when both enable-fileUrl and enable-unsecure-features are given
+            "WARNING: You have chosen to run tika-server with fileUrl enabled.\n"+
+            "Whoever has access to your service now has the same read permissions\n"+
+            "as tika-server. Users could request and receive a sensitive file from your\n" +
+            "drive or a webpage from your intranet.  See CVE-2015-3271.\n"+
+            "Please make sure you know what you are doing.";
+
     private static Options getOptions() {
         Options options = new Options();
         options.addOption("C", "cors", true, "origin allowed to make CORS requests (default=NONE)\nall allowed if \"all\"");
@@ -79,6 +86,8 @@ public class TikaServerCli {
         options.addOption("l", "log", true, "request URI log level ('debug' or 'info')");
         options.addOption("s", "includeStack", false, "whether or not to return a stack trace\nif there is an exception during 'parse'");
         options.addOption("?", "help", false, "this help message");
+        options.addOption("enable-unsecure-features", false, "this is required to enable fileUrl.");
+        options.addOption("enable-fileUrl", false, "allows user to pass in fileUrl instead of InputStream.");
 
         return options;
     }
@@ -166,8 +175,22 @@ public class TikaServerCli {
                         CommonsDigester.parse(line.getOptionValue("digest")));
             }
 
+            if (line.hasOption("enable-fileUrl") &&
+                    !line.hasOption("enable-unsecure-features")) {
+                System.err.println("If you want to enable fileUrl, you must also acknowledge the security risks\n"+
+                "by including --enable-unsecure-features.  See CVE-2015-3271.");
+                System.exit(-1);
+            }
+            InputStreamFactory inputStreamFactory = null;
+            if (line.hasOption("enable-fileUrl") &&
+                    line.hasOption("enable-unsecure-features")) {
+                inputStreamFactory = new URLEnabledInputStreamFactory();
+                System.out.println(FILE_URL_WARNING);
+            } else {
+                inputStreamFactory = new DefaultInputStreamFactory();
+            }
 
-            TikaResource.init(tika, digester);
+            TikaResource.init(tika, digester, inputStreamFactory);
             JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
 
             List<ResourceProvider> rCoreProviders = new ArrayList<ResourceProvider>();

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java b/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
new file mode 100644
index 0000000..10d4180
--- /dev/null
+++ b/tika-server/src/main/java/org/apache/tika/server/URLEnabledInputStreamFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import javax.ws.rs.core.HttpHeaders;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.io.TikaInputStream;
+
+/**
+ * This class looks for &quot;fileUrl&quot; in the http header.  If it is not null and not empty,
+ * this will return a new TikaInputStream from the URL; otherwise the given InputStream is returned.
+ * <p>
+ * This is not meant to be used in place of a robust, responsible crawler.  Rather, this
+ * is a convenience factory.
+ * <p>
+ * <em>WARNING:</em> Unless you carefully lock down access to the server,
+ * whoever has access to this service will have the read access of the server.
+ * In short, anyone with access to this service could request and get
+ * &quot;file:///etc/supersensitive_file_dont_read.txt&quot;.  Or, if your server has access
+ * to your intranet, and you let the public hit this service, they will now
+ * have access to your intranet.
+ * See <a href="https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271">CVE-2015-3271</a>
+ *
+ */
+public class URLEnabledInputStreamFactory implements InputStreamFactory {
+
+    @Override
+    public InputStream getInputSteam(InputStream is, HttpHeaders httpHeaders) throws IOException {
+        String fileUrl = httpHeaders.getHeaderString("fileUrl");
+        if(fileUrl != null && !"".equals(fileUrl)){ // header present and non-empty
+            return TikaInputStream.get(new URL(fileUrl)); // the supplied stream "is" is not used on this path; a malformed URL surfaces as an IOException (MalformedURLException)
+        }
+        return is; // no usable fileUrl header: hand back the supplied stream unchanged
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
index f1f5a29..9f19ad6 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
@@ -46,7 +46,7 @@ public class DetectorResource {
     public String detect(final InputStream is,
                          @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
         Metadata met = new Metadata();
-        TikaInputStream tis = TikaInputStream.get(is);
+        TikaInputStream tis = TikaInputStream.get(TikaResource.getInputStream(is, httpHeaders));
         String filename = TikaResource.detectFilename(httpHeaders
                 .getRequestHeaders());
         logger.info("Detecting media type for Filename: " + filename);

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
index 89d35e8..e5e5a1f 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
@@ -17,9 +17,6 @@
 
 package org.apache.tika.server.resource;
 
-import java.io.IOException;
-import java.io.InputStream;
-
 import javax.ws.rs.Consumes;
 import javax.ws.rs.POST;
 import javax.ws.rs.PUT;
@@ -31,6 +28,8 @@ import javax.ws.rs.core.HttpHeaders;
 import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.Response;
 import javax.ws.rs.core.UriInfo;
+import java.io.IOException;
+import java.io.InputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -58,7 +57,7 @@ public class MetadataResource {
     @Produces({"text/csv", "application/json", "application/rdf+xml"})
     public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception {
         return Response.ok(
-                parseMetadata(is, httpHeaders.getRequestHeaders(), info)).build();
+                parseMetadata(TikaResource.getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info)).build();
     }
 
     /**
@@ -94,7 +93,7 @@ public class MetadataResource {
         Response.Status defaultErrorResponse = Response.Status.BAD_REQUEST;
         Metadata metadata = null;
         try {
-            metadata = parseMetadata(is, httpHeaders.getRequestHeaders(), info);
+            metadata = parseMetadata(TikaResource.getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info);
             // once we've parsed the document successfully, we should use NOT_FOUND
             // if we did not see the field
             defaultErrorResponse = Response.Status.NOT_FOUND;

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index aa4e0ab..b967f8b 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -17,8 +17,6 @@
 
 package org.apache.tika.server.resource;
 
-import java.io.InputStream;
-
 import javax.ws.rs.Consumes;
 import javax.ws.rs.POST;
 import javax.ws.rs.PUT;
@@ -30,6 +28,7 @@ import javax.ws.rs.core.HttpHeaders;
 import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.Response;
 import javax.ws.rs.core.UriInfo;
+import java.io.InputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -118,7 +117,8 @@ public class RecursiveMetadataResource {
                                 @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName
                                 ) throws Exception {
         return Response.ok(
-                parseMetadata(is, httpHeaders.getRequestHeaders(), info, handlerTypeName)).build();
+                parseMetadata(TikaResource.getInputStream(is, httpHeaders),
+						httpHeaders.getRequestHeaders(), info, handlerTypeName)).build();
     }
 
 	private MetadataList parseMetadata(InputStream is,

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 566203a..c5150a1 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -73,6 +73,7 @@ import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.apache.tika.sax.RichTextContentHandler;
+import org.apache.tika.server.InputStreamFactory;
 import org.apache.tika.server.TikaServerParseException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -88,10 +89,13 @@ public class TikaResource {
 
     private static TikaConfig tikaConfig;
     private static DigestingParser.Digester digester = null;
+    private static InputStreamFactory inputStreamFactory = null;
 
-    public static void init(TikaConfig config, DigestingParser.Digester digestr) {
+    public static void init(TikaConfig config, DigestingParser.Digester digestr,
+                            InputStreamFactory iSF) {
         tikaConfig = config;
         digester = digestr;
+        inputStreamFactory = iSF;
     }
 
     static {
@@ -172,6 +176,14 @@ public class TikaResource {
         }
     }
 
+    public static InputStream getInputStream(InputStream is, HttpHeaders headers) { // resolves the stream to parse via the factory passed to init()
+        try {
+            return inputStreamFactory.getInputSteam(is, headers); // NOTE(review): the field is declared null until init() assigns it; presumably this NPEs if init() was never called -- confirm
+        } catch (IOException e) {
+            throw new TikaServerParseException(e); // rethrow the checked IOException wrapped, so callers need no throws clause
+        }
+    }
+
     /**
      * Utility method to set a property on a class via reflection.
      *
@@ -337,7 +349,7 @@ public class TikaResource {
     @Consumes("*/*")
     @Produces("text/plain")
     public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
-        return produceText(is, httpHeaders.getRequestHeaders(), info);
+        return produceText(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info);
     }
 
     public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) {
@@ -375,7 +387,7 @@ public class TikaResource {
     @Consumes("*/*")
     @Produces("text/html")
     public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
-        return produceOutput(is, httpHeaders.getRequestHeaders(), info, "html");
+        return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "html");
     }
 
     @POST
@@ -390,7 +402,7 @@ public class TikaResource {
     @Consumes("*/*")
     @Produces("text/xml")
     public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
-        return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml");
+        return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "xml");
     }
 
     private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders,

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index 8ee516e..383af98 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -17,6 +17,8 @@
 
 package org.apache.tika.server.resource;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import javax.ws.rs.PUT;
 import javax.ws.rs.Path;
 import javax.ws.rs.Produces;
@@ -63,8 +65,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 @Path("/unpack")
 public class UnpackerResource {
     public static final String TEXT_FILENAME = "__TEXT__";
@@ -93,7 +93,7 @@ public class UnpackerResource {
             @Context HttpHeaders httpHeaders,
             @Context UriInfo info
     ) throws Exception {
-        return process(is, httpHeaders, info, false);
+        return process(TikaResource.getInputStream(is, httpHeaders), httpHeaders, info, false);
     }
 
     @Path("/all{id:(/.*)?}")
@@ -104,7 +104,7 @@ public class UnpackerResource {
             @Context HttpHeaders httpHeaders,
             @Context UriInfo info
     ) throws Exception {
-        return process(is, httpHeaders, info, true);
+        return process(TikaResource.getInputStream(is, httpHeaders), httpHeaders, info, true);
     }
 
     private Map<String, byte[]> process(

http://git-wip-us.apache.org/repos/asf/tika/blob/d612aea8/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 770b678..2a09968 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -82,7 +82,9 @@ public abstract class CXFTestBase {
     @Before
     public void setUp() {
         this.tika = TikaConfig.getDefaultConfig();
-        TikaResource.init(tika, new CommonsDigester(DIGESTER_READ_LIMIT, CommonsDigester.DigestAlgorithm.MD5));
+        TikaResource.init(tika,
+                new CommonsDigester(DIGESTER_READ_LIMIT, CommonsDigester.DigestAlgorithm.MD5),
+                new DefaultInputStreamFactory());
         JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
         setUpResources(sf);
         setUpProviders(sf);