You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/09/13 13:15:00 UTC

[jira] [Commented] (PARQUET-1261) Parquet-format interns strings when reading filemetadata

    [ https://issues.apache.org/jira/browse/PARQUET-1261?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16613462#comment-16613462 ] 

ASF GitHub Bot commented on PARQUET-1261:
-----------------------------------------

robert3005 closed pull request #92: PARQUET-1261 - Remove string interning
URL: https://github.com/apache/parquet-format/pull/92
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/main/java/org/apache/parquet/format/InterningProtocol.java b/src/main/java/org/apache/parquet/format/InterningProtocol.java
deleted file mode 100644
index 6297f1c1..00000000
--- a/src/main/java/org/apache/parquet/format/InterningProtocol.java
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.parquet.format;
-
-import java.nio.ByteBuffer;
-
-import org.apache.thrift.TException;
-import org.apache.thrift.protocol.TField;
-import org.apache.thrift.protocol.TList;
-import org.apache.thrift.protocol.TMap;
-import org.apache.thrift.protocol.TMessage;
-import org.apache.thrift.protocol.TProtocol;
-import org.apache.thrift.protocol.TSet;
-import org.apache.thrift.protocol.TStruct;
-import org.apache.thrift.transport.TTransport;
-
-/**
- * TProtocol that interns the strings.
- *
- * @author Julien Le Dem
- *
- */
-public class InterningProtocol extends TProtocol {
-
-  private final TProtocol delegate;
-
-  public InterningProtocol(TProtocol delegate) {
-    super(delegate.getTransport());
-    this.delegate = delegate;
-  }
-
-  public TTransport getTransport() {
-    return delegate.getTransport();
-  }
-
-  public void writeMessageBegin(TMessage message) throws TException {
-    delegate.writeMessageBegin(message);
-  }
-
-  public void writeMessageEnd() throws TException {
-    delegate.writeMessageEnd();
-  }
-
-  public int hashCode() {
-    return delegate.hashCode();
-  }
-
-  public void writeStructBegin(TStruct struct) throws TException {
-    delegate.writeStructBegin(struct);
-  }
-
-  public void writeStructEnd() throws TException {
-    delegate.writeStructEnd();
-  }
-
-  public void writeFieldBegin(TField field) throws TException {
-    delegate.writeFieldBegin(field);
-  }
-
-  public void writeFieldEnd() throws TException {
-    delegate.writeFieldEnd();
-  }
-
-  public void writeFieldStop() throws TException {
-    delegate.writeFieldStop();
-  }
-
-  public void writeMapBegin(TMap map) throws TException {
-    delegate.writeMapBegin(map);
-  }
-
-  public void writeMapEnd() throws TException {
-    delegate.writeMapEnd();
-  }
-
-  public void writeListBegin(TList list) throws TException {
-    delegate.writeListBegin(list);
-  }
-
-  public void writeListEnd() throws TException {
-    delegate.writeListEnd();
-  }
-
-  public void writeSetBegin(TSet set) throws TException {
-    delegate.writeSetBegin(set);
-  }
-
-  public void writeSetEnd() throws TException {
-    delegate.writeSetEnd();
-  }
-
-  public void writeBool(boolean b) throws TException {
-    delegate.writeBool(b);
-  }
-
-  public void writeByte(byte b) throws TException {
-    delegate.writeByte(b);
-  }
-
-  public void writeI16(short i16) throws TException {
-    delegate.writeI16(i16);
-  }
-
-  public void writeI32(int i32) throws TException {
-    delegate.writeI32(i32);
-  }
-
-  public void writeI64(long i64) throws TException {
-    delegate.writeI64(i64);
-  }
-
-  public void writeDouble(double dub) throws TException {
-    delegate.writeDouble(dub);
-  }
-
-  public void writeString(String str) throws TException {
-    delegate.writeString(str);
-  }
-
-  public void writeBinary(ByteBuffer buf) throws TException {
-    delegate.writeBinary(buf);
-  }
-
-  public TMessage readMessageBegin() throws TException {
-    return delegate.readMessageBegin();
-  }
-
-  public void readMessageEnd() throws TException {
-    delegate.readMessageEnd();
-  }
-
-  public TStruct readStructBegin() throws TException {
-    return delegate.readStructBegin();
-  }
-
-  public void readStructEnd() throws TException {
-    delegate.readStructEnd();
-  }
-
-  public TField readFieldBegin() throws TException {
-    return delegate.readFieldBegin();
-  }
-
-  public void readFieldEnd() throws TException {
-    delegate.readFieldEnd();
-  }
-
-  public TMap readMapBegin() throws TException {
-    return delegate.readMapBegin();
-  }
-
-  public void readMapEnd() throws TException {
-    delegate.readMapEnd();
-  }
-
-  public TList readListBegin() throws TException {
-    return delegate.readListBegin();
-  }
-
-  public void readListEnd() throws TException {
-    delegate.readListEnd();
-  }
-
-  public TSet readSetBegin() throws TException {
-    return delegate.readSetBegin();
-  }
-
-  public void readSetEnd() throws TException {
-    delegate.readSetEnd();
-  }
-
-  public boolean equals(Object obj) {
-    return delegate.equals(obj);
-  }
-
-  public boolean readBool() throws TException {
-    return delegate.readBool();
-  }
-
-  public byte readByte() throws TException {
-    return delegate.readByte();
-  }
-
-  public short readI16() throws TException {
-    return delegate.readI16();
-  }
-
-  public int readI32() throws TException {
-    return delegate.readI32();
-  }
-
-  public long readI64() throws TException {
-    return delegate.readI64();
-  }
-
-  public double readDouble() throws TException {
-    return delegate.readDouble();
-  }
-
-  public String readString() throws TException {
-    // this is where we intern the strings
-    return delegate.readString().intern();
-  }
-
-  public ByteBuffer readBinary() throws TException {
-    return delegate.readBinary();
-  }
-
-  public void reset() {
-    delegate.reset();
-  }
-
-  public String toString() {
-    return delegate.toString();
-  }
-
-}
diff --git a/src/main/java/org/apache/parquet/format/Util.java b/src/main/java/org/apache/parquet/format/Util.java
index 55d61ff4..14320d51 100644
--- a/src/main/java/org/apache/parquet/format/Util.java
+++ b/src/main/java/org/apache/parquet/format/Util.java
@@ -220,8 +220,8 @@ private static TProtocol protocol(InputStream from) {
     return protocol(new TIOStreamTransport(from));
   }
 
-  private static InterningProtocol protocol(TIOStreamTransport t) {
-    return new InterningProtocol(new TCompactProtocol(t));
+  private static TProtocol protocol(TIOStreamTransport t) {
+    return new TCompactProtocol(t);
   }
 
   private static <T extends TBase<?,?>> T read(InputStream from, T tbase) throws IOException {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Parquet-format interns strings when reading filemetadata
> --------------------------------------------------------
>
>                 Key: PARQUET-1261
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1261
>             Project: Parquet
>          Issue Type: Bug
>    Affects Versions: 1.9.0
>            Reporter: Robert Kruszewski
>            Assignee: Robert Kruszewski
>            Priority: Major
>              Labels: pull-request-available
>
> Parquet-format when deserializing metadata will intern strings. References I could find suggested that it had been done to reduce memory pressure early on. Java (and jvm in particular) went a long way since then and interning is generally discouraged, see [https://shipilev.net/jvm-anatomy-park/10-string-intern/] for a good explanation. What is more since java 8 there's string deduplication implemented at GC level per [http://openjdk.java.net/jeps/192.] During our usage and testing we found the interning to cause significant gc pressure for long running applications due to bigger GC root set.
> This issue proposes removing interning given it's questionable whether it should be used in modern jvms.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)