You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/10/22 08:07:07 UTC

[09/13] lucene-solr:jira/http2: SOLR-10981: Support for stream.url or stream.file pointing to gzipped data

SOLR-10981: Support for stream.url or stream.file pointing to gzipped data


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/1a8188d9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/1a8188d9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/1a8188d9

Branch: refs/heads/jira/http2
Commit: 1a8188d92b8148f2d937bd038f48f103526fcbcc
Parents: fd91648
Author: Andrew Lundgren <lu...@byu.net>
Authored: Thu Oct 18 19:53:21 2018 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Thu Oct 18 19:53:21 2018 -0400

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   3 +
 solr/solr-ref-guide/src/content-streams.adoc    |   4 +
 .../solr/common/util/ContentStreamBase.java     | 104 +++++++---
 .../solr/common/util/ContentStreamTest.java     | 196 +++++++++++++------
 4 files changed, 229 insertions(+), 78 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 746f349..cfc9d3a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -209,6 +209,9 @@ Improvements
 
 * SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)
 
+* SOLR-10981: Support for stream.url or stream.file pointing to gzipped data.  It's detected by either a content
+  encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
+
 ==================  7.5.0 ==================
 
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/solr-ref-guide/src/content-streams.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/content-streams.adoc b/solr/solr-ref-guide/src/content-streams.adoc
index 17c1997..fa579bf 100644
--- a/solr/solr-ref-guide/src/content-streams.adoc
+++ b/solr/solr-ref-guide/src/content-streams.adoc
@@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
 If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
 ====
 
+The source of the data can be compressed using gzip, and Solr will generally detect this.
+The detection is based on either the presence of a `Content-Encoding: gzip` HTTP header or the file name ending with `.gz` or `.gzip`.
+Gzip doesn't apply to `stream.body`.
+
 == Debugging Requests
 
 The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java b/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
index f757c60..c2da9af 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
@@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLConnection;
 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Locale;
+import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;
 
+import org.apache.http.entity.ContentType;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.request.RequestWriter;
 
@@ -42,8 +47,14 @@ import org.apache.solr.client.solrj.request.RequestWriter;
  */
 public abstract class ContentStreamBase implements ContentStream
 {
+
   public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
-  
+  private static final String TEXT_CSV = "text/csv";
+  private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
+  private static final List<String> XML_SUF =  Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
+  private static final List<String> JSON_SUF =  Arrays.asList(".json", ".json.gz", ".json.gzip");
+  private static final List<String> CSV_SUF =  Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
+
   protected String name;
   protected String sourceInfo;
   protected String contentType;
@@ -62,7 +73,46 @@ public abstract class ContentStreamBase implements ContentStream
     }
     return null;
   }
-  
+
+  protected String attemptToDetermineContentType() {
+    String type = null;
+    if (name != null) {
+      Predicate<String> endsWith = suffix->name.toLowerCase(Locale.ROOT).endsWith(suffix);
+
+      if (XML_SUF.stream().anyMatch(endsWith)) {
+        type = ContentType.APPLICATION_XML.getMimeType();
+      } else if (JSON_SUF.stream().anyMatch(endsWith)) {
+        type = ContentType.APPLICATION_JSON.getMimeType();
+      } else if (CSV_SUF.stream().anyMatch(endsWith)) {
+        type = TEXT_CSV;
+      } else {
+        type = attemptToDetermineTypeFromFirstCharacter();
+      }
+    }
+    return type;
+  }
+
+  private String attemptToDetermineTypeFromFirstCharacter() {
+    String type = null;
+    try (InputStream stream = getStream()) {
+      // Last ditch effort to determine content, if the first non-white space
+      // is a '<' or '{', assume xml or json.
+      int data = stream.read();
+      while (( data != -1 ) && ( ( (char)data ) == ' ' )) {
+        data = stream.read();
+      }
+      if ((char)data == '<') {
+        type = ContentType.APPLICATION_XML.getMimeType();
+      } else if ((char)data == '{') {
+        type = ContentType.APPLICATION_JSON.getMimeType();
+      }
+    } catch (Exception ex) {
+      // This code just eats the exception and leaves
+      // the contentType untouched.
+    }
+    return type;
+  }
+
   //------------------------------------------------------------------------
   //------------------------------------------------------------------------
   
@@ -82,13 +132,32 @@ public abstract class ContentStreamBase implements ContentStream
     }
 
     @Override
+    public String getContentType() {
+      // for file:// streams that are octet-streams, try to determine the payload
+      // type from payload rather than just using the mime type.
+      if ("file".equals(url.getProtocol())) {
+        Predicate<String> equals = mimeType->mimeType.equals(contentType);
+        if (UNHELPFUL_TYPES.stream().anyMatch(equals)) {
+          String type = attemptToDetermineContentType();
+          contentType = ( type != null ) ? type : contentType;
+        }
+      }
+      return contentType;
+    }
+
+    @Override
     public InputStream getStream() throws IOException {
       URLConnection conn = this.url.openConnection();
       
       contentType = conn.getContentType();
       name = url.toExternalForm();
-      size = (long) conn.getContentLength();
-      return conn.getInputStream();
+      size = conn.getContentLengthLong();
+      InputStream is = conn.getInputStream();
+      String urlFile = url.getFile().toLowerCase(Locale.ROOT);
+      if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
+        is = new GZIPInputStream(is);
+      }
+      return is;
     }
   }
   
@@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
     @Override
     public String getContentType() {
       if(contentType==null) {
-        // TODO: this is buggy... does not allow for whitespace, JSON comments, etc.
-        InputStream stream = null;
-        try {
-          stream = new FileInputStream(file);
-          char first = (char)stream.read();
-          if(first == '<') {
-            return "application/xml";
-          }
-          if(first == '{') {
-            return "application/json";
-          }
-        } catch(Exception ex) {
-        } finally {
-          if (stream != null) try {
-            stream.close();
-          } catch (IOException ioe) {}
-        }
+        contentType = attemptToDetermineContentType();
       }
       return contentType;
     }
 
     @Override
     public InputStream getStream() throws IOException {
-      return new FileInputStream( file );
+      InputStream is = new FileInputStream( file );
+      String lowerName = name.toLowerCase(Locale.ROOT);
+      if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
+        is = new GZIPInputStream(is);
+      }
+      return is;
     }
   }
   
@@ -273,7 +331,7 @@ public abstract class ContentStreamBase implements ContentStream
     }
     
     public ByteArrayStream( byte[] bytes, String source, String contentType ) {
-      this.bytes = bytes; 
+      this.bytes = bytes;
       
       this.contentType = contentType;
       name = source;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java b/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
index 58996e4..2fc4f9a 100644
--- a/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
+++ b/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
@@ -25,88 +25,174 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.core.SolrResourceLoader;
 
 /**
+ * Tests {@link ContentStream} such as "stream.file".
  */
-public class ContentStreamTest extends SolrTestCaseJ4 
-{  
-  public void testStringStream() throws IOException 
-  {
+public class ContentStreamTest extends SolrTestCaseJ4 {
+
+  public void testStringStream() throws IOException {
     String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
-    ContentStreamBase stream = new ContentStreamBase.StringStream( input );
-    assertEquals( input.length(), stream.getSize().intValue() );
-    assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) );
-    assertEquals( input, IOUtils.toString( stream.getReader() ) );
+    ContentStreamBase stream = new ContentStreamBase.StringStream(input);
+    assertEquals(input.length(), stream.getSize().intValue());
+    assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
+    assertEquals(input, IOUtils.toString(stream.getReader()));
   }
 
-  public void testFileStream() throws IOException 
-  {
-    File file = null;
-    try (SolrResourceLoader loader = new SolrResourceLoader();
-         InputStream is = loader.openResource( "solrj/README" )) {
+  public void testFileStream() throws IOException {
+    File file = new File(createTempDir().toFile(), "README");
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file)) {
       assertNotNull(is);
-      file = new File(createTempDir().toFile(), "README");
-      try (FileOutputStream os = new FileOutputStream(file)) {
-        IOUtils.copy(is, os);
-      }
+      IOUtils.copy(is, os);
     }
 
     ContentStreamBase stream = new ContentStreamBase.FileStream(file);
-    InputStream s = stream.getStream();
-    FileInputStream fis = new FileInputStream(file);
-    InputStreamReader isr = new InputStreamReader(
-        new FileInputStream(file), StandardCharsets.UTF_8);
-    Reader r = stream.getReader();
-    try {
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         InputStreamReader isr = new InputStreamReader(
+             new FileInputStream(file), StandardCharsets.UTF_8);
+         Reader r = stream.getReader()) {
       assertEquals(file.length(), stream.getSize().intValue());
+      // Test the code that sets content based on < being the 1st character
+      assertEquals("application/xml", stream.getContentType());
       assertTrue(IOUtils.contentEquals(fis, s));
       assertTrue(IOUtils.contentEquals(isr, r));
-    } finally {
-      s.close();
-      r.close();
-      isr.close();
-      fis.close();
     }
   }
-  
 
-  public void testURLStream() throws IOException 
-  {
-    File file = null;
-    FileOutputStream os = null;
+  public void testFileStreamGZIP() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.gz");
 
-    try (SolrResourceLoader loader = new SolrResourceLoader();
-         InputStream is = loader.openResource( "solrj/README" )) {
-      assertNotNull(is);
-      file = new File(createTempDir().toFile(), "README");
-      os = new FileOutputStream(file);
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.FileStream(file);
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      assertEquals(file.length(), stream.getSize().intValue());
+      // Test the code that sets content based on < being the 1st character
+      assertEquals("application/xml", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+    }
+  }
+
+  public void testURLStream() throws IOException {
+    File file = new File(createTempDir().toFile(), "README");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file)) {
       IOUtils.copy(is, os);
-      os.close();
-      is.close();
     }
-    
-    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file
-        .toURI().toASCIIString()));
-    InputStream s = stream.getStream();
-    FileInputStream fis = new FileInputStream(file);
-    FileInputStream fis2 = new FileInputStream(file);
-    InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
-    Reader r = stream.getReader();
-    try {
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         FileInputStream fis2 = new FileInputStream(file);
+         InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
+         Reader r = stream.getReader()) {
+      // For file URLs, the content type is determined automatically by the mime type
+      // associated with the file extension.
+      // This is inconsistent with FileStream, as that code tries to guess the content based on the 1st character.
+      //
+      // For HTTP URLs, the content type is determined by the headers.  Those are not tested here.
+      //
+      assertEquals("text/html", stream.getContentType());
       assertTrue(IOUtils.contentEquals(fis2, s));
       assertEquals(file.length(), stream.getSize().intValue());
       assertTrue(IOUtils.contentEquals(isr, r));
       assertEquals(file.length(), stream.getSize().intValue());
-    } finally {
-      r.close();
-      s.close();
-      isr.close();
-      fis.close();
-      fis2.close();
+    }
+  }
+
+  public void testURLStreamGZIP() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.gz");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      // See the non-GZIP test case for an explanation of header handling.
+      assertEquals("application/xml", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+      assertEquals(file.length(), stream.getSize().intValue());
+    }
+  }
+
+  public void testURLStreamCSVGZIPExtention() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.CSV.gz");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      // See the non-GZIP test case for an explanation of header handling.
+      assertEquals("text/csv", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+      assertEquals(file.length(), stream.getSize().intValue());
+    }
+  }
+
+  public void testURLStreamJSONGZIPExtention() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.json.gzip");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      // See the non-GZIP test case for an explanation of header handling.
+      assertEquals("application/json", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+      assertEquals(file.length(), stream.getSize().intValue());
     }
   }
 }