You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/10/22 08:07:07 UTC
[09/13] lucene-solr:jira/http2: SOLR-10981: Support for stream.url or
stream.file pointing to gzipped data
SOLR-10981: Support for stream.url or stream.file pointing to gzipped data
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/1a8188d9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/1a8188d9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/1a8188d9
Branch: refs/heads/jira/http2
Commit: 1a8188d92b8148f2d937bd038f48f103526fcbcc
Parents: fd91648
Author: Andrew Lundgren <lu...@byu.net>
Authored: Thu Oct 18 19:53:21 2018 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Thu Oct 18 19:53:21 2018 -0400
----------------------------------------------------------------------
solr/CHANGES.txt | 3 +
solr/solr-ref-guide/src/content-streams.adoc | 4 +
.../solr/common/util/ContentStreamBase.java | 104 +++++++---
.../solr/common/util/ContentStreamTest.java | 196 +++++++++++++------
4 files changed, 229 insertions(+), 78 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 746f349..cfc9d3a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -209,6 +209,9 @@ Improvements
* SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)
+* SOLR-10981: Support for stream.url or stream.file pointing to gzipped data. It's detected by either a content
+ encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
+
================== 7.5.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/solr-ref-guide/src/content-streams.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/content-streams.adoc b/solr/solr-ref-guide/src/content-streams.adoc
index 17c1997..fa579bf 100644
--- a/solr/solr-ref-guide/src/content-streams.adoc
+++ b/solr/solr-ref-guide/src/content-streams.adoc
@@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
====
+The source of the data can be compressed using gzip, and Solr will generally detect this.
+The detection is based on either the presence of a `Content-Encoding: gzip` HTTP header or the file name ending with `.gz` or `.gzip`.
+Gzip doesn't apply to `stream.body`.
+
== Debugging Requests
The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java b/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
index f757c60..c2da9af 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
@@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
import java.util.Locale;
+import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;
+import org.apache.http.entity.ContentType;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.request.RequestWriter;
@@ -42,8 +47,14 @@ import org.apache.solr.client.solrj.request.RequestWriter;
*/
public abstract class ContentStreamBase implements ContentStream
{
+
public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
-
+ private static final String TEXT_CSV = "text/csv";
+ private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
+ private static final List<String> XML_SUF = Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
+ private static final List<String> JSON_SUF = Arrays.asList(".json", ".json.gz", ".json.gzip");
+ private static final List<String> CSV_SUF = Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
+
protected String name;
protected String sourceInfo;
protected String contentType;
@@ -62,7 +73,46 @@ public abstract class ContentStreamBase implements ContentStream
}
return null;
}
-
+
+ protected String attemptToDetermineContentType() {
+ String type = null;
+ if (name != null) {
+ Predicate<String> endsWith = suffix->name.toLowerCase(Locale.ROOT).endsWith(suffix);
+
+ if (XML_SUF.stream().anyMatch(endsWith)) {
+ type = ContentType.APPLICATION_XML.getMimeType();
+ } else if (JSON_SUF.stream().anyMatch(endsWith)) {
+ type = ContentType.APPLICATION_JSON.getMimeType();
+ } else if (CSV_SUF.stream().anyMatch(endsWith)) {
+ type = TEXT_CSV;
+ } else {
+ type = attemptToDetermineTypeFromFirstCharacter();
+ }
+ }
+ return type;
+ }
+
+ private String attemptToDetermineTypeFromFirstCharacter() {
+ String type = null;
+ try (InputStream stream = getStream()) {
+ // Last-ditch effort to determine the content type: if the first character after
+ // any leading spaces is a '<' or '{', assume XML or JSON respectively.
+ int data = stream.read();
+ while (( data != -1 ) && ( ( (char)data ) == ' ' )) {
+ data = stream.read();
+ }
+ if ((char)data == '<') {
+ type = ContentType.APPLICATION_XML.getMimeType();
+ } else if ((char)data == '{') {
+ type = ContentType.APPLICATION_JSON.getMimeType();
+ }
+ } catch (Exception ex) {
+ // Deliberately swallow the exception: detection is best-effort,
+ // so on failure the content type is simply left undetermined.
+ }
+ return type;
+ }
+
//------------------------------------------------------------------------
//------------------------------------------------------------------------
@@ -82,13 +132,32 @@ public abstract class ContentStreamBase implements ContentStream
}
@Override
+ public String getContentType() {
+ // For file:// URLs whose reported MIME type is unhelpful (e.g. octet-stream), try to
+ // determine the type from the file name or the payload rather than just using that MIME type.
+ if ("file".equals(url.getProtocol())) {
+ Predicate<String> equals = mimeType->mimeType.equals(contentType);
+ if (UNHELPFUL_TYPES.stream().anyMatch(equals)) {
+ String type = attemptToDetermineContentType();
+ contentType = ( type != null ) ? type : contentType;
+ }
+ }
+ return contentType;
+ }
+
+ @Override
public InputStream getStream() throws IOException {
URLConnection conn = this.url.openConnection();
contentType = conn.getContentType();
name = url.toExternalForm();
- size = (long) conn.getContentLength();
- return conn.getInputStream();
+ size = conn.getContentLengthLong();
+ InputStream is = conn.getInputStream();
+ String urlFile = url.getFile().toLowerCase(Locale.ROOT);
+ if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
+ is = new GZIPInputStream(is);
+ }
+ return is;
}
}
@@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
@Override
public String getContentType() {
if(contentType==null) {
- // TODO: this is buggy... does not allow for whitespace, JSON comments, etc.
- InputStream stream = null;
- try {
- stream = new FileInputStream(file);
- char first = (char)stream.read();
- if(first == '<') {
- return "application/xml";
- }
- if(first == '{') {
- return "application/json";
- }
- } catch(Exception ex) {
- } finally {
- if (stream != null) try {
- stream.close();
- } catch (IOException ioe) {}
- }
+ contentType = attemptToDetermineContentType();
}
return contentType;
}
@Override
public InputStream getStream() throws IOException {
- return new FileInputStream( file );
+ InputStream is = new FileInputStream( file );
+ String lowerName = name.toLowerCase(Locale.ROOT);
+ if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
+ is = new GZIPInputStream(is);
+ }
+ return is;
}
}
@@ -273,7 +331,7 @@ public abstract class ContentStreamBase implements ContentStream
}
public ByteArrayStream( byte[] bytes, String source, String contentType ) {
- this.bytes = bytes;
+ this.bytes = bytes;
this.contentType = contentType;
name = source;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1a8188d9/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java b/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
index 58996e4..2fc4f9a 100644
--- a/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
+++ b/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
@@ -25,88 +25,174 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrResourceLoader;
/**
+ * Tests {@link ContentStream} such as "stream.file".
*/
-public class ContentStreamTest extends SolrTestCaseJ4
-{
- public void testStringStream() throws IOException
- {
+public class ContentStreamTest extends SolrTestCaseJ4 {
+
+ public void testStringStream() throws IOException {
String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
- ContentStreamBase stream = new ContentStreamBase.StringStream( input );
- assertEquals( input.length(), stream.getSize().intValue() );
- assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) );
- assertEquals( input, IOUtils.toString( stream.getReader() ) );
+ ContentStreamBase stream = new ContentStreamBase.StringStream(input);
+ assertEquals(input.length(), stream.getSize().intValue());
+ assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
+ assertEquals(input, IOUtils.toString(stream.getReader()));
}
- public void testFileStream() throws IOException
- {
- File file = null;
- try (SolrResourceLoader loader = new SolrResourceLoader();
- InputStream is = loader.openResource( "solrj/README" )) {
+ public void testFileStream() throws IOException {
+ File file = new File(createTempDir().toFile(), "README");
+ try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+ FileOutputStream os = new FileOutputStream(file)) {
assertNotNull(is);
- file = new File(createTempDir().toFile(), "README");
- try (FileOutputStream os = new FileOutputStream(file)) {
- IOUtils.copy(is, os);
- }
+ IOUtils.copy(is, os);
}
ContentStreamBase stream = new ContentStreamBase.FileStream(file);
- InputStream s = stream.getStream();
- FileInputStream fis = new FileInputStream(file);
- InputStreamReader isr = new InputStreamReader(
- new FileInputStream(file), StandardCharsets.UTF_8);
- Reader r = stream.getReader();
- try {
+ try (InputStream s = stream.getStream();
+ FileInputStream fis = new FileInputStream(file);
+ InputStreamReader isr = new InputStreamReader(
+ new FileInputStream(file), StandardCharsets.UTF_8);
+ Reader r = stream.getReader()) {
assertEquals(file.length(), stream.getSize().intValue());
+ // Test the code that sets the content type based on '<' being the 1st character
+ assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(fis, s));
assertTrue(IOUtils.contentEquals(isr, r));
- } finally {
- s.close();
- r.close();
- isr.close();
- fis.close();
}
}
-
- public void testURLStream() throws IOException
- {
- File file = null;
- FileOutputStream os = null;
+ public void testFileStreamGZIP() throws IOException {
+ File file = new File(createTempDir().toFile(), "README.gz");
- try (SolrResourceLoader loader = new SolrResourceLoader();
- InputStream is = loader.openResource( "solrj/README" )) {
- assertNotNull(is);
- file = new File(createTempDir().toFile(), "README");
- os = new FileOutputStream(file);
+ try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+ FileOutputStream os = new FileOutputStream(file);
+ GZIPOutputStream zos = new GZIPOutputStream(os)) {
+ IOUtils.copy(is, zos);
+ }
+
+ ContentStreamBase stream = new ContentStreamBase.FileStream(file);
+ try (InputStream s = stream.getStream();
+ FileInputStream fis = new FileInputStream(file);
+ GZIPInputStream zis = new GZIPInputStream(fis);
+ InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+ FileInputStream fis2 = new FileInputStream(file);
+ GZIPInputStream zis2 = new GZIPInputStream(fis2);
+ Reader r = stream.getReader()) {
+ assertEquals(file.length(), stream.getSize().intValue());
+ // Test the code that sets the content type based on '<' being the 1st character
+ assertEquals("application/xml", stream.getContentType());
+ assertTrue(IOUtils.contentEquals(isr, r));
+ assertTrue(IOUtils.contentEquals(zis2, s));
+ }
+ }
+
+ public void testURLStream() throws IOException {
+ File file = new File(createTempDir().toFile(), "README");
+
+ try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+ FileOutputStream os = new FileOutputStream(file)) {
IOUtils.copy(is, os);
- os.close();
- is.close();
}
-
- ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file
- .toURI().toASCIIString()));
- InputStream s = stream.getStream();
- FileInputStream fis = new FileInputStream(file);
- FileInputStream fis2 = new FileInputStream(file);
- InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
- Reader r = stream.getReader();
- try {
+
+ ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+
+ try (InputStream s = stream.getStream();
+ FileInputStream fis = new FileInputStream(file);
+ FileInputStream fis2 = new FileInputStream(file);
+ InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
+ Reader r = stream.getReader()) {
+ // For file URLs, the content type is determined automatically from the MIME type
+ // associated with the file extension.
+ // This is inconsistent with FileStream, which tries to guess the content type from the 1st character.
+ //
+ // For HTTP URLs, the content type is determined by the headers. Those are not tested here.
+ //
+ assertEquals("text/html", stream.getContentType());
assertTrue(IOUtils.contentEquals(fis2, s));
assertEquals(file.length(), stream.getSize().intValue());
assertTrue(IOUtils.contentEquals(isr, r));
assertEquals(file.length(), stream.getSize().intValue());
- } finally {
- r.close();
- s.close();
- isr.close();
- fis.close();
- fis2.close();
+ }
+ }
+
+ public void testURLStreamGZIP() throws IOException {
+ File file = new File(createTempDir().toFile(), "README.gz");
+
+ try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+ FileOutputStream os = new FileOutputStream(file);
+ GZIPOutputStream zos = new GZIPOutputStream(os)) {
+ IOUtils.copy(is, zos);
+ }
+
+ ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+ try (InputStream s = stream.getStream();
+ FileInputStream fis = new FileInputStream(file);
+ GZIPInputStream zis = new GZIPInputStream(fis);
+ InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+ FileInputStream fis2 = new FileInputStream(file);
+ GZIPInputStream zis2 = new GZIPInputStream(fis2);
+ Reader r = stream.getReader()) {
+ // See the non-GZIP test case for an explanation of header handling.
+ assertEquals("application/xml", stream.getContentType());
+ assertTrue(IOUtils.contentEquals(isr, r));
+ assertTrue(IOUtils.contentEquals(zis2, s));
+ assertEquals(file.length(), stream.getSize().intValue());
+ }
+ }
+
+ public void testURLStreamCSVGZIPExtention() throws IOException {
+ File file = new File(createTempDir().toFile(), "README.CSV.gz");
+
+ try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+ FileOutputStream os = new FileOutputStream(file);
+ GZIPOutputStream zos = new GZIPOutputStream(os)) {
+ IOUtils.copy(is, zos);
+ }
+
+ ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+ try (InputStream s = stream.getStream();
+ FileInputStream fis = new FileInputStream(file);
+ GZIPInputStream zis = new GZIPInputStream(fis);
+ InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+ FileInputStream fis2 = new FileInputStream(file);
+ GZIPInputStream zis2 = new GZIPInputStream(fis2);
+ Reader r = stream.getReader()) {
+ // See the non-GZIP test case for an explanation of header handling.
+ assertEquals("text/csv", stream.getContentType());
+ assertTrue(IOUtils.contentEquals(isr, r));
+ assertTrue(IOUtils.contentEquals(zis2, s));
+ assertEquals(file.length(), stream.getSize().intValue());
+ }
+ }
+
+ public void testURLStreamJSONGZIPExtention() throws IOException {
+ File file = new File(createTempDir().toFile(), "README.json.gzip");
+
+ try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+ FileOutputStream os = new FileOutputStream(file);
+ GZIPOutputStream zos = new GZIPOutputStream(os)) {
+ IOUtils.copy(is, zos);
+ }
+
+ ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+ try (InputStream s = stream.getStream();
+ FileInputStream fis = new FileInputStream(file);
+ GZIPInputStream zis = new GZIPInputStream(fis);
+ InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+ FileInputStream fis2 = new FileInputStream(file);
+ GZIPInputStream zis2 = new GZIPInputStream(fis2);
+ Reader r = stream.getReader()) {
+ // See the non-GZIP test case for an explanation of header handling.
+ assertEquals("application/json", stream.getContentType());
+ assertTrue(IOUtils.contentEquals(isr, r));
+ assertTrue(IOUtils.contentEquals(zis2, s));
+ assertEquals(file.length(), stream.getSize().intValue());
}
}
}