You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2018/06/05 19:29:59 UTC

[2/5] jena git commit: JENA-1554: Add bz2 compression/decompression

JENA-1554: Add bz2 compression/decompression

Add Snappy
  default 32k block
  decompress only; compressor not available

Update javadoc (RDFLanguages, BinRDF) that mentions gz.


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/f88fbc57
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/f88fbc57
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/f88fbc57

Branch: refs/heads/master
Commit: f88fbc578d02ed8925104bf5d4a03795470d9275
Parents: eb9ba39
Author: Andy Seaborne <an...@apache.org>
Authored: Sun Jun 3 10:11:13 2018 +0100
Committer: Andy Seaborne <an...@apache.org>
Committed: Sun Jun 3 10:11:13 2018 +0100

----------------------------------------------------------------------
 .../java/org/apache/jena/riot/RDFLanguages.java |  6 +--
 .../org/apache/jena/riot/thrift/BinRDF.java     |  5 +-
 .../main/java/org/apache/jena/atlas/io/IO.java  | 49 ++++++++++++-----
 .../java/org/apache/jena/atlas/io/TS_IO.java    |  1 +
 .../jena/atlas/io/TestFilenameExtensions.java   | 56 ++++++++++++++++++++
 5 files changed, 97 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/f88fbc57/jena-arq/src/main/java/org/apache/jena/riot/RDFLanguages.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/RDFLanguages.java b/jena-arq/src/main/java/org/apache/jena/riot/RDFLanguages.java
index cdfb6a5..d3f5c08 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/RDFLanguages.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/RDFLanguages.java
@@ -45,6 +45,7 @@ import static org.apache.jena.riot.WebContent.contentTypeTurtleAlt2;
 
 import java.util.*;
 
+import org.apache.jena.atlas.io.IO;
 import org.apache.jena.atlas.logging.Log ;
 import org.apache.jena.atlas.web.ContentType ;
 import org.apache.jena.atlas.web.MediaType ;
@@ -409,9 +410,8 @@ public class RDFLanguages
         int iHash = filename.indexOf('#');
         if ( iHash  > 0 )
             filename = filename.substring(0, iHash);
-        // Gzip compressed?
-        if ( filename.endsWith(".gz") )
-            filename = filename.substring(0, filename.length()-3);
+        // Strip any recognised compression extension (.gz, .bz2, .sz) before looking up the language.
+        filename = IO.filenameNoCompression(filename);
         return fileExtToLang(FileUtils.getFilenameExt(filename));
     }
 

http://git-wip-us.apache.org/repos/asf/jena/blob/f88fbc57/jena-arq/src/main/java/org/apache/jena/riot/thrift/BinRDF.java
----------------------------------------------------------------------
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/thrift/BinRDF.java b/jena-arq/src/main/java/org/apache/jena/riot/thrift/BinRDF.java
index 96e4ea6..7da523d 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/thrift/BinRDF.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/thrift/BinRDF.java
@@ -62,8 +62,8 @@ public class BinRDF {
     }
     
     /** 
-     * Create an {@link StreamRDF} for output.  A filename ending {@code .gz} will have
-     * a gzip compressor added to the output path. A filename of "-" is {@code System.out}.
+     * Create a {@link StreamRDF} for output.  A filename ending {@code .gz} or {@code .bz2} will have
+     * the respective compressor added to the output path. A filename of "-" is {@code System.out}.
      * The file is closed when {@link StreamRDF#finish()} is called unless it is {@code System.out}.  
      * Call {@link StreamRDF#start()}...{@link StreamRDF#finish()}.
      * 
@@ -73,7 +73,6 @@ public class BinRDF {
      */
     public static StreamRDF streamToFile(String filename, boolean withValues) {
         OutputStream out = IO.openOutputFile(filename) ;
-        // Is this internally buffered as well?
         BufferedOutputStream bout = new BufferedOutputStream(out, BUFSIZE_OUT) ;
         TProtocol protocol = TRDF.protocol(bout) ;
         return new StreamRDF2Thrift(protocol, withValues) ;

http://git-wip-us.apache.org/repos/asf/jena/blob/f88fbc57/jena-base/src/main/java/org/apache/jena/atlas/io/IO.java
----------------------------------------------------------------------
diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/IO.java b/jena-base/src/main/java/org/apache/jena/atlas/io/IO.java
index fea37ac..3a74913 100644
--- a/jena-base/src/main/java/org/apache/jena/atlas/io/IO.java
+++ b/jena-base/src/main/java/org/apache/jena/atlas/io/IO.java
@@ -24,7 +24,11 @@ import java.nio.charset.StandardCharsets ;
 import java.util.zip.GZIPInputStream ;
 import java.util.zip.GZIPOutputStream ;
 
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
+import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
 import org.apache.jena.atlas.RuntimeIOException ;
+import org.apache.jena.atlas.lib.FileOps;
 import org.apache.jena.atlas.lib.IRILib ;
 
 public class IO
@@ -63,7 +67,7 @@ public class IO
     
     /** Open an input stream to a file; do not mask IOExceptions. 
      * If the filename is null or "-", return System.in
-     * If the filename ends in .gz, wrap in  GZIPInputStream  
+     * If the filename ends in .gz, .bz2 or .sz, wrap in GZIPInputStream, BZip2CompressorInputStream or SnappyCompressorInputStream respectively
      * @param filename
      * @throws FileNotFoundException 
      * @throws IOException
@@ -77,10 +81,28 @@ public class IO
             filename = IRILib.decode(filename) ;
         }
         InputStream in = new FileInputStream(filename) ;
-        if ( filename.endsWith(".gz") )
-            in = new GZIPInputStream(in) ;
+        String ext = FileOps.extension(filename);
+        switch ( ext ) {
+            case "":        return in;
+            case "gz":      return new GZIPInputStream(in) ;
+            case "bz2":     return new BZip2CompressorInputStream(in);
+            case "sz":      return new SnappyCompressorInputStream(in);
+        }
         return in ;
     }
+
+    private static String[] extensions = { ".gz", ".bz2", ".sz" }; 
+    
+    /** The filename without any compression extension, or the original filename.
+     *  It tests for compression types handled by {@link #openFileEx}.
+     */
+    static public String filenameNoCompression(String filename) {
+        for ( String ext : extensions ) {
+            if ( filename.endsWith(ext) )
+                return filename.substring(0, filename.length()-ext.length());
+        }
+        return filename;
+    }
     
     /** Open a UTF8 Reader for a file. 
      * If the filename is null or "-", use System.in
@@ -134,11 +156,8 @@ public class IO
     }
 
     /** Open a file for output - may include adding gzip processing. */
-    static public OutputStream openOutputFile(String filename)
-    {
-        try {
-           return openOutputFileEx(filename) ;
-        }
+    static public OutputStream openOutputFile(String filename) {
+        try { return openOutputFileEx(filename) ; }
         catch (IOException ex) { IO.exception(ex) ; return null ; }
     }
     
@@ -158,15 +177,18 @@ public class IO
             filename = IRILib.decode(filename) ;
         }
         OutputStream out = new FileOutputStream(filename) ;
-        if ( filename.endsWith(".gz") )
-            out = new GZIPOutputStream(out) ;
+        String ext = FileOps.extension(filename);
+        switch ( ext ) {
+            case "":        return out;
+            case "gz":      return new GZIPOutputStream(out) ;
+            case "bz2":     return new BZip2CompressorOutputStream(out);
+            case "sz":      throw new UnsupportedOperationException("Snappy output");
+        }
         return out ;
     }
     
     /** Wrap in a general writer interface */ 
-    static public AWriter wrap(Writer w) { 
-        return Writer2.wrap(w) ;
-    }
+    static public AWriter wrap(Writer w)                    { return Writer2.wrap(w) ; }
     
     /** Wrap in a general writer interface */ 
     static public AWriter wrapUTF8(OutputStream out)        { return wrap(asUTF8(out)) ; } 
@@ -343,5 +365,4 @@ public class IO
             return null ;
         }
     }
-
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/f88fbc57/jena-base/src/test/java/org/apache/jena/atlas/io/TS_IO.java
----------------------------------------------------------------------
diff --git a/jena-base/src/test/java/org/apache/jena/atlas/io/TS_IO.java b/jena-base/src/test/java/org/apache/jena/atlas/io/TS_IO.java
index 4479243..085cbf3 100644
--- a/jena-base/src/test/java/org/apache/jena/atlas/io/TS_IO.java
+++ b/jena-base/src/test/java/org/apache/jena/atlas/io/TS_IO.java
@@ -37,6 +37,7 @@ import org.junit.runners.Suite ;
     // Writers
     , TestBufferingWriter.class
     // Other
+    , TestFilenameExtensions.class
     , TestPrintUtils.class
 } )
 public class TS_IO

http://git-wip-us.apache.org/repos/asf/jena/blob/f88fbc57/jena-base/src/test/java/org/apache/jena/atlas/io/TestFilenameExtensions.java
----------------------------------------------------------------------
diff --git a/jena-base/src/test/java/org/apache/jena/atlas/io/TestFilenameExtensions.java b/jena-base/src/test/java/org/apache/jena/atlas/io/TestFilenameExtensions.java
new file mode 100644
index 0000000..edb6848
--- /dev/null
+++ b/jena-base/src/test/java/org/apache/jena/atlas/io/TestFilenameExtensions.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.atlas.io;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+public class TestFilenameExtensions {
+
+    @Test public void ext_1() { 
+        String fn1 = "file.txt";
+        String fn2 = IO.filenameNoCompression(fn1);
+        assertEquals(fn1, fn2);
+    }
+    
+    @Test public void ext_2() { 
+        String fn1 = "a/b/file.gz";
+        String fn2 = IO.filenameNoCompression(fn1);
+        assertEquals("a/b/file", fn2);
+    }
+
+    @Test public void ext_3() { 
+        String fn1 = "file.ttl.bz2";
+        String fn2 = IO.filenameNoCompression(fn1);
+        assertEquals("file.ttl", fn2);
+    }
+    
+    @Test public void ext_4() { 
+        String fn1 = "file.txt.gz";
+        String fn2 = IO.filenameNoCompression(fn1);
+        assertEquals("file.txt", fn2);
+    }
+
+    @Test public void ext_5() { 
+        String fn1 = "a/b/file.ttl.bz2";
+        String fn2 = IO.filenameNoCompression(fn1);
+        assertEquals("a/b/file.ttl", fn2);
+    }
+}