You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jb...@apache.org on 2021/02/08 20:17:12 UTC
[lucene-solr] branch master updated: SOLR-15142: Allow the cat Streaming Expression to read gzip files
This is an automated email from the ASF dual-hosted git repository.
jbernste pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new da8b8ec SOLR-15142: Allow the cat Streaming Expression to read gzip files
da8b8ec is described below
commit da8b8ecdb8b7fd338bab3b442334e89b8ccf6d73
Author: Joel Bernstein <jb...@apache.org>
AuthorDate: Mon Feb 8 15:07:09 2021 -0500
SOLR-15142: Allow the cat Streaming Expression to read gzip files
---
.../java/org/apache/solr/handler/CatStream.java | 11 ++++++-
.../solrj/io/stream/StreamExpressionTest.java | 38 ++++++++++++++++++++--
2 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/handler/CatStream.java b/solr/core/src/java/org/apache/solr/handler/CatStream.java
index d7f5fe6..e6d58c5 100644
--- a/solr/core/src/java/org/apache/solr/handler/CatStream.java
+++ b/solr/core/src/java/org/apache/solr/handler/CatStream.java
@@ -17,7 +17,10 @@
package org.apache.solr.handler;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.lang.invoke.MethodHandles;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -25,6 +28,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
@@ -180,7 +184,12 @@ public class CatStream extends TupleStream implements Expressible {
while (allFilesToCrawl.hasNext()) {
closeCurrentFileIfSet();
currentFilePath = allFilesToCrawl.next();
- currentFileLines = FileUtils.lineIterator(currentFilePath.absolutePath.toFile(), "UTF-8");
+ File currentFile = currentFilePath.absolutePath.toFile();
+ if(currentFile.getName().endsWith(".gz")) {
+ currentFileLines = new LineIterator(new InputStreamReader(new GZIPInputStream(new FileInputStream(currentFile)), "UTF-8"));
+ } else {
+ currentFileLines = FileUtils.lineIterator(currentFile, "UTF-8");
+ }
if (currentFileLines.hasNext()) return true;
}
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
index 7d8a062..56769c2 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
@@ -16,9 +16,7 @@
*/
package org.apache.solr.client.solrj.io.stream;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.IOException;
+import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -30,6 +28,7 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
+import java.util.zip.GZIPOutputStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
@@ -3484,6 +3483,28 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
@Test
+ public void testCatStreamSingleGzipFile() throws Exception {
+ final String catStream = "cat(\"topLevel1.txt.gz\")";
+ ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+ paramsLoc.set("expr", catStream);
+ paramsLoc.set("qt", "/stream");
+ String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
+
+ SolrStream solrStream = new SolrStream(url, paramsLoc);
+
+ StreamContext context = new StreamContext();
+ solrStream.setStreamContext(context);
+ List<Tuple> tuples = getTuples(solrStream);
+ assertEquals(4, tuples.size());
+
+ for (int i = 0; i < 4; i++) {
+ Tuple t = tuples.get(i);
+ assertEquals("topLevel1.txt.gz line " + String.valueOf(i+1), t.get("line"));
+ assertEquals("topLevel1.txt.gz", t.get("file"));
+ }
+ }
+
+ @Test
public void testCatStreamEmptyFile() throws Exception {
final String catStream = "cat(\"topLevel-empty.txt\")";
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
@@ -3648,6 +3669,7 @@ public class StreamExpressionTest extends SolrCloudTestCase {
Files.createDirectories(dataDir);
Files.createDirectories(dataDir.resolve("directory1"));
+ populateFileWithGzipData(dataDir.resolve("topLevel1.txt.gz"));
populateFileWithData(dataDir.resolve("topLevel1.txt"));
populateFileWithData(dataDir.resolve("topLevel2.txt"));
Files.createFile(dataDir.resolve("topLevel-empty.txt"));
@@ -3665,6 +3687,16 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
}
+ private static void populateFileWithGzipData(Path dataFile) throws Exception {
+ Files.createFile(dataFile);
+ try (final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(dataFile.toFile())), StandardCharsets.UTF_8))) {
+ for (int i = 1; i <=4; i++) {
+ writer.write(dataFile.getFileName() + " line " + i);
+ writer.newLine();
+ }
+ }
+ }
+
protected List<Tuple> getTuples(TupleStream tupleStream) throws IOException {
List<Tuple> tuples = new ArrayList<Tuple>();