You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2019/09/25 16:02:25 UTC

[carbondata] branch master updated: [CARBONDATA-3523] Store data file size into index file

This is an automated email from the ASF dual-hosted git repository.

jackylk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git


The following commit(s) were added to refs/heads/master by this push:
     new 64a574e  [CARBONDATA-3523] Store data file size into index file
64a574e is described below

commit 64a574ecf144bde446396c1404f31cc39d4e5810
Author: QiangCai <qi...@qq.com>
AuthorDate: Tue Aug 13 10:25:31 2019 +0800

    [CARBONDATA-3523] Store data file size into index file
    
    In the BlockIndex entries of the index file, the file_size field was always zero. This change stores the actual data file size in the index file during data loading and reads it back at query time, which improves query performance in two ways:
    
    1. avoid invoking listFiles for each segment
    2. avoid invoking getFileStatus for each data file
    
    This closes #3356
---
 .../core/datastore/block/TableBlockInfo.java           | 13 +++++++++++++
 .../carbondata/core/metadata/index/BlockIndexInfo.java | 18 ++++++++++++++++++
 .../core/util/AbstractDataFileFooterConverter.java     |  3 +++
 .../carbondata/core/util/BlockletDataMapUtil.java      | 17 ++++++++++++++---
 .../carbondata/core/util/CarbonMetadataUtil.java       |  1 +
 .../store/writer/AbstractFactDataWriter.java           |  5 +++--
 .../store/writer/v3/CarbonFactDataWriterImplV3.java    | 18 ++++++++++--------
 7 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/block/TableBlockInfo.java b/core/src/main/java/org/apache/carbondata/core/datastore/block/TableBlockInfo.java
index 25d82f8..4dd1403 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/block/TableBlockInfo.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/block/TableBlockInfo.java
@@ -54,6 +54,11 @@ public class TableBlockInfo implements Distributable, Serializable {
   private String filePath;
 
   /**
+   * file size of the block
+   */
+  private long fileSize;
+
+  /**
    * block offset in the file
    */
   private long blockOffset;
@@ -439,6 +444,14 @@ public class TableBlockInfo implements Distributable, Serializable {
     this.filePath = filePath;
   }
 
+  public long getFileSize() {
+    return fileSize;
+  }
+
+  public void setFileSize(long fileSize) {
+    this.fileSize = fileSize;
+  }
+
   public BlockletDetailInfo getDetailInfo() {
     return detailInfo;
   }
diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/index/BlockIndexInfo.java b/core/src/main/java/org/apache/carbondata/core/metadata/index/BlockIndexInfo.java
index ae99ed8..f7f2d3c 100644
--- a/core/src/main/java/org/apache/carbondata/core/metadata/index/BlockIndexInfo.java
+++ b/core/src/main/java/org/apache/carbondata/core/metadata/index/BlockIndexInfo.java
@@ -51,6 +51,11 @@ public class BlockIndexInfo {
   private BlockletInfo blockletInfo;
 
   /**
+   * file size
+   */
+  private long fileSize;
+
+  /**
    * Constructor
    *
    * @param numberOfRows  number of rows
@@ -80,6 +85,12 @@ public class BlockIndexInfo {
     this.blockletInfo = blockletInfo;
   }
 
+  public BlockIndexInfo(long numberOfRows, String fileName, long offset,
+      BlockletIndex blockletIndex, BlockletInfo blockletInfo, long fileSize) {
+    this(numberOfRows, fileName, offset, blockletIndex, blockletInfo);
+    this.fileSize = fileSize;
+  }
+
   /**
    * @return the numberOfRows
    */
@@ -114,4 +125,11 @@ public class BlockIndexInfo {
   public BlockletInfo getBlockletInfo() {
     return blockletInfo;
   }
+
+  /**
+   * @return file size
+   */
+  public long getFileSize() {
+    return fileSize;
+  }
 }
diff --git a/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java b/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java
index 64d30c2..f16a3ae 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java
@@ -244,6 +244,9 @@ public abstract class AbstractDataFileFooterConverter {
     }
     fileName = (CarbonCommonConstants.FILE_SEPARATOR + fileName).replaceAll("//", "/");
     tableBlockInfo.setFilePath(parentPath + fileName);
+    if (readBlockIndexInfo.isSetFile_size()) {
+      tableBlockInfo.setFileSize(readBlockIndexInfo.getFile_size());
+    }
     return tableBlockInfo;
   }
 
diff --git a/core/src/main/java/org/apache/carbondata/core/util/BlockletDataMapUtil.java b/core/src/main/java/org/apache/carbondata/core/util/BlockletDataMapUtil.java
index 6cd60a2..5a988c4 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/BlockletDataMapUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/BlockletDataMapUtil.java
@@ -38,9 +38,11 @@ import org.apache.carbondata.common.logging.LogServiceFactory;
 import org.apache.carbondata.core.constants.CarbonCommonConstants;
 import org.apache.carbondata.core.datamap.Segment;
 import org.apache.carbondata.core.datastore.block.SegmentProperties;
+import org.apache.carbondata.core.datastore.block.TableBlockInfo;
 import org.apache.carbondata.core.datastore.compression.CompressorFactory;
 import org.apache.carbondata.core.datastore.filesystem.AbstractDFSCarbonFile;
 import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
+import org.apache.carbondata.core.datastore.filesystem.S3CarbonFile;
 import org.apache.carbondata.core.datastore.impl.FileFactory;
 import org.apache.carbondata.core.indexstore.BlockMetaInfo;
 import org.apache.carbondata.core.indexstore.TableBlockIndexUniqueIdentifier;
@@ -124,7 +126,8 @@ public class BlockletDataMapUtil {
       }
       String blockPath = footer.getBlockInfo().getTableBlockInfo().getFilePath();
       if (null == blockMetaInfoMap.get(blockPath)) {
-        BlockMetaInfo blockMetaInfo = createBlockMetaInfo(fileNameToMetaInfoMapping, blockPath);
+        BlockMetaInfo blockMetaInfo = createBlockMetaInfo(
+            fileNameToMetaInfoMapping, footer.getBlockInfo().getTableBlockInfo());
         // if blockMetaInfo is null that means the file has been deleted from the file system.
         // This can happen in case IUD scenarios where after deleting or updating the data the
         // complete block is deleted but the entry still exists in index or merge index file
@@ -148,7 +151,7 @@ public class BlockletDataMapUtil {
       String segmentFilePath, Configuration configuration) throws IOException {
     Map<String, BlockMetaInfo> fileNameToMetaInfoMapping = new TreeMap();
     CarbonFile carbonFile = FileFactory.getCarbonFile(segmentFilePath, configuration);
-    if (carbonFile instanceof AbstractDFSCarbonFile) {
+    if (carbonFile instanceof AbstractDFSCarbonFile && !(carbonFile instanceof S3CarbonFile)) {
       PathFilter pathFilter = new PathFilter() {
         @Override public boolean accept(Path path) {
           return CarbonTablePath.isCarbonDataFile(path.getName());
@@ -166,11 +169,19 @@ public class BlockletDataMapUtil {
   }
 
   private static BlockMetaInfo createBlockMetaInfo(
-      Map<String, BlockMetaInfo> fileNameToMetaInfoMapping, String carbonDataFile)
+      Map<String, BlockMetaInfo> fileNameToMetaInfoMapping, TableBlockInfo blockInfo)
       throws IOException {
+    String carbonDataFile = blockInfo.getFilePath();
     FileFactory.FileType fileType = FileFactory.getFileType(carbonDataFile);
     switch (fileType) {
+      case S3:
       case LOCAL:
+        // consider backward compatibility
+        // when the file size in blockInfo is not zero, use this file size in blockInfo.
+        if (blockInfo.getFileSize() != 0) {
+          return new BlockMetaInfo(new String[] { "localhost" }, blockInfo.getFileSize());
+        }
+        // when the file size in blockInfo is zero, get the size of this file.
         if (!FileFactory.isFileExist(carbonDataFile)) {
           return null;
         }
diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
index 7414ab7..5777f4d 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
@@ -392,6 +392,7 @@ public class CarbonMetadataUtil {
       if (blockIndexInfo.getBlockletInfo() != null) {
         blockIndex.setBlocklet_info(getBlocletInfo3(blockIndexInfo.getBlockletInfo()));
       }
+      blockIndex.setFile_size(blockIndexInfo.getFileSize());
       thriftBlockIndexList.add(blockIndex);
     }
     return thriftBlockIndexList;
diff --git a/processing/src/main/java/org/apache/carbondata/processing/store/writer/AbstractFactDataWriter.java b/processing/src/main/java/org/apache/carbondata/processing/store/writer/AbstractFactDataWriter.java
index eb1b15d..fbe5953 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/store/writer/AbstractFactDataWriter.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/store/writer/AbstractFactDataWriter.java
@@ -365,10 +365,11 @@ public abstract class AbstractFactDataWriter implements CarbonFactDataWriter {
    *
    * @param numberOfRows    number of rows in file
    * @param carbonDataFileName The name of carbonData file
-   * @param currentPosition current offset
+   * @param footerOffset footer offset
+   * @param fileSize size of the carbondata file
    */
   protected abstract void fillBlockIndexInfoDetails(long numberOfRows, String carbonDataFileName,
-      long currentPosition);
+      long footerOffset, long fileSize);
 
   public static List<org.apache.carbondata.format.ColumnSchema> getColumnSchemaListAndCardinality(
       List<Integer> cardinality, int[] dictionaryColumnCardinality,
diff --git a/processing/src/main/java/org/apache/carbondata/processing/store/writer/v3/CarbonFactDataWriterImplV3.java b/processing/src/main/java/org/apache/carbondata/processing/store/writer/v3/CarbonFactDataWriterImplV3.java
index cac0e8b..c8673a0 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/store/writer/v3/CarbonFactDataWriterImplV3.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/store/writer/v3/CarbonFactDataWriterImplV3.java
@@ -101,7 +101,7 @@ public class CarbonFactDataWriterImplV3 extends AbstractFactDataWriter {
   protected void writeFooterToFile() throws CarbonDataWriterException {
     try {
       // get the current file position
-      long currentPosition = currentOffsetInFile;
+      long footerOffset = currentOffsetInFile;
       // get thrift file footer instance
       FileFooter3 convertFileMeta = CarbonMetadataUtil
           .convertFileFooterVersion3(blockletMetadata, blockletIndex, localCardinality,
@@ -116,16 +116,17 @@ public class CarbonFactDataWriterImplV3 extends AbstractFactDataWriter {
       convertFileMeta.putToExtra_info(CarbonCommonConstants.CARBON_WRITTEN_BY_FOOTER_INFO, appName);
       convertFileMeta.putToExtra_info(CarbonCommonConstants.CARBON_WRITTEN_VERSION,
           CarbonVersionConstants.CARBONDATA_VERSION);
-      // fill the carbon index details
-      fillBlockIndexInfoDetails(convertFileMeta.getNum_rows(), carbonDataFileName, currentPosition);
       // write the footer
       byte[] byteArray = CarbonUtil.getByteArray(convertFileMeta);
       ByteBuffer buffer =
           ByteBuffer.allocate(byteArray.length + CarbonCommonConstants.LONG_SIZE_IN_BYTE);
       buffer.put(byteArray);
-      buffer.putLong(currentPosition);
+      buffer.putLong(footerOffset);
       buffer.flip();
       currentOffsetInFile += fileChannel.write(buffer);
+      // fill the carbon index details
+      fillBlockIndexInfoDetails(
+          convertFileMeta.getNum_rows(), carbonDataFileName, footerOffset, currentOffsetInFile);
     } catch (IOException e) {
       LOGGER.error("Problem while writing the carbon file", e);
       throw new CarbonDataWriterException("Problem while writing the carbon file: ", e);
@@ -346,11 +347,12 @@ public class CarbonFactDataWriterImplV3 extends AbstractFactDataWriter {
    *
    * @param numberOfRows       number of rows in file
    * @param carbonDataFileName The name of carbonData file
-   * @param currentPosition    current offset
+   * @param footerOffset       footer offset
+   * @param fileSize           file size
    */
   @Override
   protected void fillBlockIndexInfoDetails(long numberOfRows, String carbonDataFileName,
-      long currentPosition) {
+      long footerOffset, long fileSize) {
     int i = 0;
     DataFileFooterConverterV3 converterV3 = new DataFileFooterConverterV3();
     for (org.apache.carbondata.format.BlockletIndex index : blockletIndex) {
@@ -367,8 +369,8 @@ public class CarbonFactDataWriterImplV3 extends AbstractFactDataWriter {
           new org.apache.carbondata.core.metadata.blocklet.index.BlockletIndex(bTreeIndex,
               minMaxIndex);
       BlockIndexInfo biInfo =
-          new BlockIndexInfo(numberOfRows, carbonDataFileName, currentPosition, bIndex,
-              blockletInfo);
+          new BlockIndexInfo(numberOfRows, carbonDataFileName, footerOffset, bIndex,
+              blockletInfo, fileSize);
       blockIndexInfoList.add(biInfo);
       i++;
     }