You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ra...@apache.org on 2019/05/19 15:39:55 UTC

[carbondata] branch master updated: [CARBONDATA-3367][CARBONDATA-3368] Fix multiple issues in SDK reader

This is an automated email from the ASF dual-hosted git repository.

ravipesala pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git


The following commit(s) were added to refs/heads/master by this push:
     new bd1d774  [CARBONDATA-3367][CARBONDATA-3368] Fix multiple issues in SDK reader
bd1d774 is described below

commit bd1d7745c1f62caedddbc519afaffd354e535b62
Author: ajantha-bhat <aj...@gmail.com>
AuthorDate: Wed Mar 6 16:44:52 2019 +0800

    [CARBONDATA-3367][CARBONDATA-3368] Fix multiple issues in SDK reader
    
    Problem:
    [CARBONDATA-3367] OOM when huge number of carbondata files are read from SDK reader
    
    Cause:
    Currently, one CarbonRecordReader is created for each carbondata file, and a list of these CarbonRecordReaders is maintained in CarbonReader. So even after a CarbonRecordReader is closed, it cannot be garbage-collected, because the list still holds a reference to that object.
    As a result, each CarbonRecordReader needs its own separate memory instead of reusing the memory of the previous reader.
    
    Solution: Once CarbonRecordReader.close is done, drop the reference to that reader from the list so that it can be garbage-collected.
    
    problem:
    [CARBONDATA-3368]InferSchema from datafile instead of index file
    
    cause:
    In the SDK, when multiple readers were created for the same folder location with different file lists, all the readers referred to the same index file for inferSchema, which caused a bottleneck and a JVM crash in the case of JNI calls.
    
    solution: Inferschema from the data file mentioned while building the reader.
    
    problem :
    Support a List interface for projection. When the SDK is called from other languages, the JNI interface supports only lists, so a List-based interface for projections needs to be added.
    
    This closes #3197
---
 .../core/metadata/schema/table/CarbonTable.java    | 43 ++--------------------
 .../apache/carbondata/sdk/file/CarbonReader.java   |  7 +++-
 .../carbondata/sdk/file/CarbonReaderBuilder.java   | 15 ++++++++
 3 files changed, 24 insertions(+), 41 deletions(-)

diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/CarbonTable.java b/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/CarbonTable.java
index c66d1fc..f9ba6f5 100644
--- a/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/CarbonTable.java
+++ b/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/CarbonTable.java
@@ -37,8 +37,6 @@ import org.apache.carbondata.core.datamap.DataMapStoreManager;
 import org.apache.carbondata.core.datamap.TableDataMap;
 import org.apache.carbondata.core.datamap.dev.DataMapFactory;
 import org.apache.carbondata.core.datastore.block.SegmentProperties;
-import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
-import org.apache.carbondata.core.datastore.impl.FileFactory;
 import org.apache.carbondata.core.features.TableOperation;
 import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
 import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
@@ -252,12 +250,9 @@ public class CarbonTable implements Serializable {
       String tableName,
       Configuration configuration) throws IOException {
     TableInfo tableInfoInfer = CarbonUtil.buildDummyTableInfo(tablePath, "null", "null");
-    CarbonFile carbonFile = getLatestIndexFile(FileFactory.getCarbonFile(tablePath, configuration));
-    if (carbonFile == null) {
-      throw new RuntimeException("Carbon index file not exists.");
-    }
-    org.apache.carbondata.format.TableInfo tableInfo = CarbonUtil
-        .inferSchemaFromIndexFile(carbonFile.getPath(), tableName);
+    // InferSchema from data file
+    org.apache.carbondata.format.TableInfo tableInfo =
+        CarbonUtil.inferSchema(tablePath, tableName, false, configuration);
     List<ColumnSchema> columnSchemaList = new ArrayList<ColumnSchema>();
     for (org.apache.carbondata.format.ColumnSchema thriftColumnSchema : tableInfo
         .getFact_table().getTable_columns()) {
@@ -271,38 +266,6 @@ public class CarbonTable implements Serializable {
     return CarbonTable.buildFromTableInfo(tableInfoInfer);
   }
 
-  private static CarbonFile getLatestIndexFile(CarbonFile tablePath) {
-    CarbonFile[] carbonFiles = tablePath.listFiles();
-    CarbonFile latestCarbonIndexFile = null;
-    long latestIndexFileTimestamp = 0L;
-    for (CarbonFile carbonFile : carbonFiles) {
-      if (carbonFile.getName().endsWith(CarbonTablePath.INDEX_FILE_EXT)
-          && carbonFile.getLastModifiedTime() > latestIndexFileTimestamp) {
-        latestCarbonIndexFile = carbonFile;
-        latestIndexFileTimestamp = carbonFile.getLastModifiedTime();
-      } else if (carbonFile.isDirectory()) {
-        // if the list has directories that doesn't contain index files,
-        // continue checking other files/directories in the list.
-        if (getLatestIndexFile(carbonFile) == null) {
-          continue;
-        } else {
-          return getLatestIndexFile(carbonFile);
-        }
-      }
-    }
-    if (latestCarbonIndexFile != null) {
-      return latestCarbonIndexFile;
-    } else {
-      // returning null only if the path doesn't have index files.
-      return null;
-    }
-  }
-
-  public static CarbonTable buildDummyTable(String tablePath) throws IOException {
-    TableInfo tableInfoInfer = CarbonUtil.buildDummyTableInfo(tablePath, "null", "null");
-    return CarbonTable.buildFromTableInfo(tableInfoInfer);
-  }
-
   public static CarbonTable buildFromTablePath(String tableName, String dbName, String tablePath,
       String tableId) throws IOException {
     return SchemaReader.readCarbonTableFromStore(
diff --git a/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java b/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java
index e5c0680..1fa4e62 100644
--- a/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java
+++ b/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java
@@ -77,9 +77,14 @@ public class CarbonReader<T> {
         // no more readers
         return false;
       } else {
-        index++;
         // current reader is closed
         currentReader.close();
+        // no need to keep a reference to CarbonVectorizedRecordReader,
+        // until all the readers are processed.
+        // If readers count is very high,
+        // we get OOM as GC not happened for any of the content in CarbonVectorizedRecordReader
+        readers.set(index,null);
+        index++;
         currentReader = readers.get(index);
         return currentReader.nextKeyValue();
       }
diff --git a/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReaderBuilder.java b/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReaderBuilder.java
index ad9a383..d35957a 100644
--- a/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReaderBuilder.java
+++ b/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReaderBuilder.java
@@ -85,6 +85,21 @@ public class CarbonReaderBuilder {
   }
 
   /**
+   * Accepts projection list
+   *
+   * @param projectionColumnNames
+   * @return
+   */
+  public CarbonReaderBuilder projection(List<String> projectionColumnNames) {
+    Objects.requireNonNull(projectionColumnNames);
+    String[] strings = new String[projectionColumnNames.size()];
+    for (int i = 0; i < projectionColumnNames.size(); i++) {
+      strings[i] = projectionColumnNames.get(i);
+    }
+    return projection(strings);
+  }
+
+  /**
    * Configure the filter expression for carbon reader
    *
    * @param filterExpression filter expression