You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jb...@apache.org on 2021/02/08 20:17:12 UTC

[lucene-solr] branch master updated: SOLR-15142: Allow the cat Streaming Expression to read gzip files

This is an automated email from the ASF dual-hosted git repository.

jbernste pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new da8b8ec  SOLR-15142: Allow the cat Streaming Expression to read gzip files
da8b8ec is described below

commit da8b8ecdb8b7fd338bab3b442334e89b8ccf6d73
Author: Joel Bernstein <jb...@apache.org>
AuthorDate: Mon Feb 8 15:07:09 2021 -0500

    SOLR-15142: Allow the cat Streaming Expression to read gzip files
---
 .../java/org/apache/solr/handler/CatStream.java    | 11 ++++++-
 .../solrj/io/stream/StreamExpressionTest.java      | 38 ++++++++++++++++++++--
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/handler/CatStream.java b/solr/core/src/java/org/apache/solr/handler/CatStream.java
index d7f5fe6..e6d58c5 100644
--- a/solr/core/src/java/org/apache/solr/handler/CatStream.java
+++ b/solr/core/src/java/org/apache/solr/handler/CatStream.java
@@ -17,7 +17,10 @@
 
 package org.apache.solr.handler;
 
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.lang.invoke.MethodHandles;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -25,6 +28,7 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.LineIterator;
@@ -180,7 +184,12 @@ public class CatStream extends TupleStream implements Expressible {
     while (allFilesToCrawl.hasNext()) {
       closeCurrentFileIfSet();
       currentFilePath = allFilesToCrawl.next();
-      currentFileLines = FileUtils.lineIterator(currentFilePath.absolutePath.toFile(), "UTF-8");
+      File currentFile = currentFilePath.absolutePath.toFile();
+      if(currentFile.getName().endsWith(".gz")) {
+        currentFileLines = new LineIterator(new InputStreamReader(new GZIPInputStream(new FileInputStream(currentFile)), "UTF-8"));
+      } else {
+        currentFileLines = FileUtils.lineIterator(currentFile, "UTF-8");
+      }
       if (currentFileLines.hasNext()) return true;
     }
 
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
index 7d8a062..56769c2 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java
@@ -16,9 +16,7 @@
  */
 package org.apache.solr.client.solrj.io.stream;
 
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.IOException;
+import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -30,6 +28,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import java.util.zip.GZIPOutputStream;
 
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
@@ -3484,6 +3483,28 @@ public class StreamExpressionTest extends SolrCloudTestCase {
   }
 
   @Test
+  public void testCatStreamSingleGzipFile() throws Exception { // cat() on a single ".gz" file must emit one tuple per line of the file
+    final String catStream = "cat(\"topLevel1.txt.gz\")";
+    ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+    paramsLoc.set("expr", catStream);
+    paramsLoc.set("qt", "/stream");
+    String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
+    // The expression is sent to the first Jetty node of the test cluster, addressed to FILESTREAM_COLLECTION.
+    SolrStream solrStream = new SolrStream(url, paramsLoc);
+
+    StreamContext context = new StreamContext();
+    solrStream.setStreamContext(context);
+    List<Tuple> tuples = getTuples(solrStream);
+    assertEquals(4, tuples.size()); // populateFileWithGzipData writes exactly four lines into the fixture
+    // Every tuple must carry the line text under "line" and the source file name under "file".
+    for (int i = 0; i < 4; i++) {
+      Tuple t = tuples.get(i);
+      assertEquals("topLevel1.txt.gz line " + String.valueOf(i+1), t.get("line")); // fixture lines are numbered from 1, the loop index from 0
+      assertEquals("topLevel1.txt.gz", t.get("file"));
+    }
+  }
+
+  @Test
   public void testCatStreamEmptyFile() throws Exception {
     final String catStream = "cat(\"topLevel-empty.txt\")";
     ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
@@ -3648,6 +3669,7 @@ public class StreamExpressionTest extends SolrCloudTestCase {
     Files.createDirectories(dataDir);
     Files.createDirectories(dataDir.resolve("directory1"));
 
+    populateFileWithGzipData(dataDir.resolve("topLevel1.txt.gz"));
     populateFileWithData(dataDir.resolve("topLevel1.txt"));
     populateFileWithData(dataDir.resolve("topLevel2.txt"));
     Files.createFile(dataDir.resolve("topLevel-empty.txt"));
@@ -3665,6 +3687,16 @@ public class StreamExpressionTest extends SolrCloudTestCase {
     }
   }
 
+  private static void populateFileWithGzipData(Path dataFile) throws Exception { // Test fixture: fills dataFile with four gzip-compressed lines of text
+    Files.createFile(dataFile); // create the file up front; Files.createFile fails if it already exists
+    try (final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(dataFile.toFile())), StandardCharsets.UTF_8))) {
+      for (int i = 1; i <=4; i++) { // four lines, numbered 1 through 4
+        writer.write(dataFile.getFileName() + " line " + i); // content is "<file name> line <i>", encoded as UTF-8 before compression
+        writer.newLine();
+      }
+    } // try-with-resources closes the writer, and with it the wrapped gzip and file streams
+  }
+
   protected List<Tuple> getTuples(TupleStream tupleStream) throws IOException {
     List<Tuple> tuples = new ArrayList<Tuple>();