You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hawq.apache.org by weinan003 <gi...@git.apache.org> on 2018/03/30 02:30:01 UTC
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
GitHub user weinan003 opened a pull request:
https://github.com/apache/incubator-hawq/pull/1350
HAWQ-1600. Parquet table data vectorized scan
You can merge this pull request into a Git repository by running:
$ git pull https://github.com/weinan003/incubator-hawq 1600
Alternatively you can review and apply these changes as the patch at:
https://github.com/apache/incubator-hawq/pull/1350.patch
To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:
This closes #1350
----
commit 7664584c8f58eff651ffa8992bdc8c1a370c974d
Author: Weinan Wang <we...@...>
Date: 2018-03-30T02:05:56Z
HAWQ-1600. Parquet table data vectorized scan
----
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by wengyanqing <gi...@git.apache.org>.
Github user wengyanqing commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178502483
--- Diff: contrib/vexecutor/vcheck.h ---
@@ -39,6 +39,7 @@ typedef struct VectorizedState
{
bool vectorized;
PlanState *parent;
+ bool* proj;
--- End diff --
This variable is only relevant to AO tables, or holds different content depending on the table format, so it would be better to rename it to something more readable, such as aoprojs, or to just use a generic (void *opaque).
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178484042
--- Diff: contrib/vexecutor/parquet_reader.c ---
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "parquet_reader.h"
+
+#include "executor/executor.h"
+#include "tuplebatch.h"
+#include "vcheck.h"
+
+extern bool getNextRowGroup(ParquetScanDesc scan);
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot);
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot);
+
+TupleTableSlot *
+ParquetVScanNext(ScanState *scanState)
+{
+ Assert(IsA(scanState, TableScanState) || IsA(scanState, DynamicTableScanState));
+ ParquetScanState *node = (ParquetScanState *)scanState;
+ Assert(node->opaque != NULL && node->opaque->scandesc != NULL);
+
+ parquet_vgetnext(node->opaque->scandesc, node->ss.ps.state->es_direction, node->ss.ss_ScanTupleSlot);
+ return node->ss.ss_ScanTupleSlot;
+}
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
+{
+
+ //AOTupleId aoTupleId;
+ Assert(ScanDirectionIsForward(direction));
+
+ for(;;)
+ {
+ if(scan->bufferDone)
+ {
+ /*
+ * Get the next row group. We call this function until we
+ * successfully get a block to process, or finished reading
+ * all the data (all 'segment' files) for this relation.
+ */
+ while(!getNextRowGroup(scan))
+ {
+ /* have we read all this relation's data. done! */
+ if(scan->pqs_done_all_splits)
+ {
+ ExecClearTuple(slot);
+ return /*NULL*/;
+ }
+ }
+ scan->bufferDone = false;
+ }
+
+ int row_num = ParquetRowGroupReader_ScanNextTupleBatch(
+ scan->pqs_tupDesc,
+ &scan->rowGroupReader,
+ scan->hawqAttrToParquetColChunks,
+ scan->proj,
+ slot);
+ if(row_num > 0)
+ return;
+
+ /* no more items in the row group, get new buffer */
+ scan->bufferDone = true;
+ }
+}
+
+/*
+ * Get next tuple batch from current row group into slot.
+ *
+ * Return false if current row group has no tuple left, true otherwise.
--- End diff --
According to the comment, this function returns true or false, but in the end it returns a number of rows. If the function returns a number, it should not return false when it finishes scanning the row group; use "0" instead, since there are no rows. If the function returns a bool, it should not return a number.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483689
--- Diff: contrib/vexecutor/vcheck.h ---
@@ -37,6 +39,7 @@ typedef struct VectorizedState
{
bool vectorized;
PlanState *parent;
+ bool* proj;
--- End diff --
Please fix indent here.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by wengyanqing <gi...@git.apache.org>.
Github user wengyanqing commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178484642
--- Diff: contrib/vexecutor/execVScan.c ---
@@ -38,8 +40,8 @@ getVScanMethod(int tableType)
},
//PARQUETSCAN
{
- &ParquetScanNext, &BeginScanParquetRelation, &EndScanParquetRelation,
- &ReScanParquetRelation, &MarkRestrNotAllowed, &MarkRestrNotAllowed
+ &ParquetVScanNext, &BeginScanParquetRelation, &EndScanParquetRelation,
+ NULL,NULL,NULL
--- End diff --
So we need to understand the difference between MarkRestrNotAllowed and NULL. Could setting these to NULL cause a crash?
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by wengyanqing <gi...@git.apache.org>.
Github user wengyanqing commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178459027
--- Diff: contrib/vexecutor/execVScan.c ---
@@ -148,7 +181,10 @@ ExecVScan(ScanState *node, ExecScanAccessMtd accessMtd)
* Form a projection tuple, store it in the result tuple slot
* and return it.
*/
- return ExecProject(projInfo, NULL);
+ ((TupleBatch)projInfo->pi_slot->PRIVATE_tb)->nrows = ((TupleBatch)slot->PRIVATE_tb)->nrows;
+ memcpy(((TupleBatch)projInfo->pi_slot->PRIVATE_tb)->skip,
--- End diff --
If possible, avoid memory copy during tuple batch processing.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483643
--- Diff: contrib/vexecutor/parquet_reader.c ---
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "parquet_reader.h"
+
+#include "executor/executor.h"
+#include "tuplebatch.h"
+#include "vcheck.h"
+
+extern bool getNextRowGroup(ParquetScanDesc scan);
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot);
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot);
+
+TupleTableSlot *
+ParquetVScanNext(ScanState *scanState)
+{
+ Assert(IsA(scanState, TableScanState) || IsA(scanState, DynamicTableScanState));
+ ParquetScanState *node = (ParquetScanState *)scanState;
+ Assert(node->opaque != NULL && node->opaque->scandesc != NULL);
+
+ parquet_vgetnext(node->opaque->scandesc, node->ss.ps.state->es_direction, node->ss.ss_ScanTupleSlot);
+ return node->ss.ss_ScanTupleSlot;
+}
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
+{
+
+ //AOTupleId aoTupleId;
+ Assert(ScanDirectionIsForward(direction));
+
+ for(;;)
+ {
+ if(scan->bufferDone)
+ {
+ /*
+ * Get the next row group. We call this function until we
+ * successfully get a block to process, or finished reading
+ * all the data (all 'segment' files) for this relation.
+ */
+ while(!getNextRowGroup(scan))
+ {
+ /* have we read all this relation's data. done! */
+ if(scan->pqs_done_all_splits)
+ {
+ ExecClearTuple(slot);
+ return /*NULL*/;
+ }
+ }
+ scan->bufferDone = false;
+ }
+
+ int row_num = ParquetRowGroupReader_ScanNextTupleBatch(
+ scan->pqs_tupDesc,
+ &scan->rowGroupReader,
+ scan->hawqAttrToParquetColChunks,
+ scan->proj,
+ slot);
+ if(row_num > 0)
+ return;
+
+ /* no more items in the row group, get new buffer */
+ scan->bufferDone = true;
+ }
+}
+
+/*
+ * Get next tuple batch from current row group into slot.
+ *
+ * Return false if current row group has no tuple left, true otherwise.
+ */
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot)
+{
+ Assert(slot);
+
+ if (rowGroupReader->rowRead >= rowGroupReader->rowCount)
+ {
+ ParquetRowGroupReader_FinishedScanRowGroup(rowGroupReader);
+ return false;
+ }
+
+ /*
+ * get the next item (tuple) from the row group
+ */
+ int ncol = slot->tts_tupleDescriptor->natts;
+ TupleBatch tb = (TupleBatch )slot->PRIVATE_tb;
+
+ tb->nrows = 0;
+ if (rowGroupReader->rowRead + tb->batchsize > rowGroupReader->rowCount) {
+ tb->nrows = rowGroupReader->rowCount-rowGroupReader->rowRead;
+ rowGroupReader->rowRead = rowGroupReader->rowCount;
+ }
+ else {
+ tb->nrows = tb->batchsize ;
+ rowGroupReader->rowRead += tb->batchsize;
+ }
+
+ int colReaderIndex = 0;
+ for(int i = 0; i < tb->ncols ; i++)
+ {
+ if(projs[i] == false)
+ continue;
+
+ Oid hawqTypeID = tupDesc->attrs[i]->atttypid;
+ Oid hawqVTypeID = GetVtype(hawqTypeID);
+ if(!tb->datagroup[i])
+ tbCreateColumn(tb,i,hawqVTypeID);
+
+ vheader* header = tb->datagroup[i];
+ header->dim = tb->nrows;
--- End diff --
Please fix indent here.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by weinan003 <gi...@git.apache.org>.
Github user weinan003 commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178480561
--- Diff: contrib/vexecutor/execVQual.c ---
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "execVQual.h"
+
+/*
+ * ExecVariableList
+ * Evaluates a simple-Variable-list projection.
+ *
+ * Results are stored into the passed values and isnull arrays.
+ */
+static void
+ExecVecVariableList(ProjectionInfo *projInfo,
+ Datum *values)
+{
+ ExprContext *econtext = projInfo->pi_exprContext;
+ int *varSlotOffsets = projInfo->pi_varSlotOffsets;
+ int *varNumbers = projInfo->pi_varNumbers;
+ TupleBatch tb = (TupleBatch) values;
+ int i;
+ tb->ncols = list_length(projInfo->pi_targetlist);
+
+ /*
+ * Assign to result by direct extraction of fields from source slots ... a
+ * mite ugly, but fast ...
+ */
+ for (i = list_length(projInfo->pi_targetlist) - 1; i >= 0; i--)
+ {
+ char *slotptr = ((char *) econtext) + varSlotOffsets[i];
+ TupleTableSlot *varSlot = *((TupleTableSlot **) slotptr);
+ int varNumber = varNumbers[i] - 1;
+ tb->datagroup[i] = ((TupleBatch)varSlot->PRIVATE_tb)->datagroup[varNumber];
+ }
+}
+
+TupleTableSlot *
+ExecVProject(ProjectionInfo *projInfo, ExprDoneCond *isDone)
+{
+ TupleTableSlot *slot;
+ Assert(projInfo != NULL);
+
+ /*
+ * get the projection info we want
+ */
+ slot = projInfo->pi_slot;
+
+ /*
+ * Clear any former contents of the result slot. This makes it safe for
+ * us to use the slot's Datum/isnull arrays as workspace. (Also, we can
+ * return the slot as-is if we decide no rows can be projected.)
+ */
+ ExecClearTuple(slot);
+
+ /*
+ * form a new result tuple (if possible); if successful, mark the result
+ * slot as containing a valid virtual tuple
+ */
+ if (projInfo->pi_isVarList)
+ {
+ /* simple Var list: this always succeeds with one result row */
+ if (isDone)
+ *isDone = ExprSingleResult;
+
+ ExecVecVariableList(projInfo,slot->PRIVATE_tb);
+ ExecStoreVirtualTuple(slot);
+ }
+ else
+ {
+ elog(FATAL,"does not support expression in projection stmt");
--- End diff --
The expression op will be supported soon, in the next change.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483545
--- Diff: contrib/vexecutor/ao_reader.c ---
@@ -0,0 +1,78 @@
+#include "ao_reader.h"
+#include "tuplebatch.h"
+#include "utils/datum.h"
+
+
+void
+BeginVScanAppendOnlyRelation(ScanState *scanState)
+{
+ BeginScanAppendOnlyRelation(scanState);
+ VectorizedState* vs = (VectorizedState*)scanState->ps.vectorized;
+ TupleBatch tb = scanState->ss_ScanTupleSlot->PRIVATE_tb;
+ vs->proj = palloc0(sizeof(bool) * tb->ncols);
+ GetNeededColumnsForScan((Node* )scanState->ps.plan->targetlist,vs->proj,tb->ncols);
+ GetNeededColumnsForScan((Node* )scanState->ps.plan->qual,vs->proj,tb->ncols);
+
+}
+
+void
+EndVScanAppendOnlyRelation(ScanState *scanState)
+{
+ VectorizedState* vs = (VectorizedState*)scanState->ps.vectorized;
+ pfree(vs->proj);
+ EndScanAppendOnlyRelation(scanState);
+}
+
+TupleTableSlot *
+AppendOnlyVScanNext(ScanState *scanState)
+{
+ TupleTableSlot *slot = scanState->ss_ScanTupleSlot;
+ TupleBatch tb = (TupleBatch)slot->PRIVATE_tb;
+ TupleDesc td = scanState->ss_ScanTupleSlot->tts_tupleDescriptor;
+ VectorizedState* vs = scanState->ps.vectorized;
+ int row = 0;
+
+ for(;row < tb->batchsize;row ++)
+ {
+ AppendOnlyScanNext(scanState);
+
+ slot = scanState->ss_ScanTupleSlot;
+ if(TupIsNull(slot))
+ break;
+
+ for(int i = 0;i < tb->ncols ; i ++)
+ {
+
+ if(vs->proj[i])
+ {
--- End diff --
redundant space here
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by weinan003 <gi...@git.apache.org>.
Github user weinan003 commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178480017
--- Diff: contrib/vexecutor/execVScan.c ---
@@ -38,8 +40,8 @@ getVScanMethod(int tableType)
},
//PARQUETSCAN
{
- &ParquetScanNext, &BeginScanParquetRelation, &EndScanParquetRelation,
- &ReScanParquetRelation, &MarkRestrNotAllowed, &MarkRestrNotAllowed
+ &ParquetVScanNext, &BeginScanParquetRelation, &EndScanParquetRelation,
+ NULL,NULL,NULL
--- End diff --
At present, we do not know which kinds of queries exercise these functions.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by wengyanqing <gi...@git.apache.org>.
Github user wengyanqing commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178458812
--- Diff: contrib/vexecutor/execVScan.c ---
@@ -38,8 +40,8 @@ getVScanMethod(int tableType)
},
//PARQUETSCAN
{
- &ParquetScanNext, &BeginScanParquetRelation, &EndScanParquetRelation,
- &ReScanParquetRelation, &MarkRestrNotAllowed, &MarkRestrNotAllowed
+ &ParquetVScanNext, &BeginScanParquetRelation, &EndScanParquetRelation,
+ NULL,NULL,NULL
--- End diff --
Why are these function pointers set to NULL?
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483636
--- Diff: contrib/vexecutor/parquet_reader.c ---
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "parquet_reader.h"
+
+#include "executor/executor.h"
+#include "tuplebatch.h"
+#include "vcheck.h"
+
+extern bool getNextRowGroup(ParquetScanDesc scan);
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot);
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot);
+
+TupleTableSlot *
+ParquetVScanNext(ScanState *scanState)
+{
+ Assert(IsA(scanState, TableScanState) || IsA(scanState, DynamicTableScanState));
+ ParquetScanState *node = (ParquetScanState *)scanState;
+ Assert(node->opaque != NULL && node->opaque->scandesc != NULL);
+
+ parquet_vgetnext(node->opaque->scandesc, node->ss.ps.state->es_direction, node->ss.ss_ScanTupleSlot);
+ return node->ss.ss_ScanTupleSlot;
+}
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
+{
+
+ //AOTupleId aoTupleId;
+ Assert(ScanDirectionIsForward(direction));
+
+ for(;;)
+ {
+ if(scan->bufferDone)
+ {
+ /*
+ * Get the next row group. We call this function until we
+ * successfully get a block to process, or finished reading
+ * all the data (all 'segment' files) for this relation.
+ */
+ while(!getNextRowGroup(scan))
+ {
+ /* have we read all this relation's data. done! */
+ if(scan->pqs_done_all_splits)
+ {
+ ExecClearTuple(slot);
+ return /*NULL*/;
+ }
+ }
+ scan->bufferDone = false;
+ }
+
+ int row_num = ParquetRowGroupReader_ScanNextTupleBatch(
+ scan->pqs_tupDesc,
+ &scan->rowGroupReader,
+ scan->hawqAttrToParquetColChunks,
+ scan->proj,
+ slot);
+ if(row_num > 0)
+ return;
+
+ /* no more items in the row group, get new buffer */
+ scan->bufferDone = true;
+ }
+}
+
+/*
+ * Get next tuple batch from current row group into slot.
+ *
+ * Return false if current row group has no tuple left, true otherwise.
+ */
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot)
+{
+ Assert(slot);
+
+ if (rowGroupReader->rowRead >= rowGroupReader->rowCount)
+ {
+ ParquetRowGroupReader_FinishedScanRowGroup(rowGroupReader);
+ return false;
+ }
+
+ /*
+ * get the next item (tuple) from the row group
+ */
+ int ncol = slot->tts_tupleDescriptor->natts;
+ TupleBatch tb = (TupleBatch )slot->PRIVATE_tb;
+
+ tb->nrows = 0;
+ if (rowGroupReader->rowRead + tb->batchsize > rowGroupReader->rowCount) {
+ tb->nrows = rowGroupReader->rowCount-rowGroupReader->rowRead;
+ rowGroupReader->rowRead = rowGroupReader->rowCount;
+ }
+ else {
+ tb->nrows = tb->batchsize ;
+ rowGroupReader->rowRead += tb->batchsize;
+ }
+
+ int colReaderIndex = 0;
+ for(int i = 0; i < tb->ncols ; i++)
+ {
+ if(projs[i] == false)
+ continue;
+
+ Oid hawqTypeID = tupDesc->attrs[i]->atttypid;
+ Oid hawqVTypeID = GetVtype(hawqTypeID);
--- End diff --
Please fix indent here.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483630
--- Diff: contrib/vexecutor/parquet_reader.c ---
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "parquet_reader.h"
+
+#include "executor/executor.h"
+#include "tuplebatch.h"
+#include "vcheck.h"
+
+extern bool getNextRowGroup(ParquetScanDesc scan);
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot);
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot);
+
+TupleTableSlot *
+ParquetVScanNext(ScanState *scanState)
+{
+ Assert(IsA(scanState, TableScanState) || IsA(scanState, DynamicTableScanState));
+ ParquetScanState *node = (ParquetScanState *)scanState;
+ Assert(node->opaque != NULL && node->opaque->scandesc != NULL);
+
+ parquet_vgetnext(node->opaque->scandesc, node->ss.ps.state->es_direction, node->ss.ss_ScanTupleSlot);
+ return node->ss.ss_ScanTupleSlot;
+}
+
+static void
+parquet_vgetnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
+{
+
+ //AOTupleId aoTupleId;
+ Assert(ScanDirectionIsForward(direction));
+
+ for(;;)
+ {
+ if(scan->bufferDone)
+ {
+ /*
+ * Get the next row group. We call this function until we
+ * successfully get a block to process, or finished reading
+ * all the data (all 'segment' files) for this relation.
+ */
+ while(!getNextRowGroup(scan))
+ {
+ /* have we read all this relation's data. done! */
+ if(scan->pqs_done_all_splits)
+ {
+ ExecClearTuple(slot);
+ return /*NULL*/;
+ }
+ }
+ scan->bufferDone = false;
+ }
+
+ int row_num = ParquetRowGroupReader_ScanNextTupleBatch(
+ scan->pqs_tupDesc,
+ &scan->rowGroupReader,
+ scan->hawqAttrToParquetColChunks,
+ scan->proj,
+ slot);
+ if(row_num > 0)
+ return;
+
+ /* no more items in the row group, get new buffer */
+ scan->bufferDone = true;
+ }
+}
+
+/*
+ * Get next tuple batch from current row group into slot.
+ *
+ * Return false if current row group has no tuple left, true otherwise.
+ */
+static int
+ParquetRowGroupReader_ScanNextTupleBatch(
+ TupleDesc tupDesc,
+ ParquetRowGroupReader *rowGroupReader,
+ int *hawqAttrToParquetColNum,
+ bool *projs,
+ TupleTableSlot *slot)
+{
+ Assert(slot);
+
+ if (rowGroupReader->rowRead >= rowGroupReader->rowCount)
+ {
+ ParquetRowGroupReader_FinishedScanRowGroup(rowGroupReader);
+ return false;
+ }
+
+ /*
+ * get the next item (tuple) from the row group
+ */
+ int ncol = slot->tts_tupleDescriptor->natts;
+ TupleBatch tb = (TupleBatch )slot->PRIVATE_tb;
--- End diff --
Please fix indent here.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by interma <gi...@git.apache.org>.
Github user interma commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178484798
--- Diff: contrib/vexecutor/Makefile ---
@@ -17,7 +17,7 @@
MODULE_big = vexecutor
-OBJS = vexecutor.o vadt.o vcheck.o tuplebatch.o execVScan.o
+OBJS = vexecutor.o vadt.o vcheck.o tuplebatch.o execVScan.o execVQual.o parquet_reader.o ao_reader.o
--- End diff --
We should add some feature tests for the AO and Parquet scans.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by weinan003 <gi...@git.apache.org>.
Github user weinan003 closed the pull request at:
https://github.com/apache/incubator-hawq/pull/1350
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by wengyanqing <gi...@git.apache.org>.
Github user wengyanqing commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178459158
--- Diff: contrib/vexecutor/execVQual.c ---
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "execVQual.h"
+
+/*
+ * ExecVariableList
+ * Evaluates a simple-Variable-list projection.
+ *
+ * Results are stored into the passed values and isnull arrays.
+ */
+static void
+ExecVecVariableList(ProjectionInfo *projInfo,
+ Datum *values)
+{
+ ExprContext *econtext = projInfo->pi_exprContext;
+ int *varSlotOffsets = projInfo->pi_varSlotOffsets;
+ int *varNumbers = projInfo->pi_varNumbers;
+ TupleBatch tb = (TupleBatch) values;
+ int i;
+ tb->ncols = list_length(projInfo->pi_targetlist);
+
+ /*
+ * Assign to result by direct extraction of fields from source slots ... a
+ * mite ugly, but fast ...
+ */
+ for (i = list_length(projInfo->pi_targetlist) - 1; i >= 0; i--)
+ {
+ char *slotptr = ((char *) econtext) + varSlotOffsets[i];
+ TupleTableSlot *varSlot = *((TupleTableSlot **) slotptr);
+ int varNumber = varNumbers[i] - 1;
+ tb->datagroup[i] = ((TupleBatch)varSlot->PRIVATE_tb)->datagroup[varNumber];
+ }
+}
+
+TupleTableSlot *
+ExecVProject(ProjectionInfo *projInfo, ExprDoneCond *isDone)
+{
+ TupleTableSlot *slot;
+ Assert(projInfo != NULL);
+
+ /*
+ * get the projection info we want
+ */
+ slot = projInfo->pi_slot;
+
+ /*
+ * Clear any former contents of the result slot. This makes it safe for
+ * us to use the slot's Datum/isnull arrays as workspace. (Also, we can
+ * return the slot as-is if we decide no rows can be projected.)
+ */
+ ExecClearTuple(slot);
+
+ /*
+ * form a new result tuple (if possible); if successful, mark the result
+ * slot as containing a valid virtual tuple
+ */
+ if (projInfo->pi_isVarList)
+ {
+ /* simple Var list: this always succeeds with one result row */
+ if (isDone)
+ *isDone = ExprSingleResult;
+
+ ExecVecVariableList(projInfo,slot->PRIVATE_tb);
+ ExecStoreVirtualTuple(slot);
+ }
+ else
+ {
+ elog(FATAL,"does not support expression in projection stmt");
--- End diff --
Any operation that is unsupported in vectorized execution should be detected before execution, and the query should fall back to the original executor. We must make sure the query still returns the correct results.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483577
--- Diff: contrib/vexecutor/ao_reader.c ---
@@ -0,0 +1,78 @@
+#include "ao_reader.h"
+#include "tuplebatch.h"
+#include "utils/datum.h"
+
+
+void
+BeginVScanAppendOnlyRelation(ScanState *scanState)
+{
+ BeginScanAppendOnlyRelation(scanState);
+ VectorizedState* vs = (VectorizedState*)scanState->ps.vectorized;
+ TupleBatch tb = scanState->ss_ScanTupleSlot->PRIVATE_tb;
+ vs->proj = palloc0(sizeof(bool) * tb->ncols);
+ GetNeededColumnsForScan((Node* )scanState->ps.plan->targetlist,vs->proj,tb->ncols);
+ GetNeededColumnsForScan((Node* )scanState->ps.plan->qual,vs->proj,tb->ncols);
+
+}
+
+void
+EndVScanAppendOnlyRelation(ScanState *scanState)
+{
+ VectorizedState* vs = (VectorizedState*)scanState->ps.vectorized;
+ pfree(vs->proj);
+ EndScanAppendOnlyRelation(scanState);
+}
+
+TupleTableSlot *
+AppendOnlyVScanNext(ScanState *scanState)
+{
+ TupleTableSlot *slot = scanState->ss_ScanTupleSlot;
+ TupleBatch tb = (TupleBatch)slot->PRIVATE_tb;
+ TupleDesc td = scanState->ss_ScanTupleSlot->tts_tupleDescriptor;
+ VectorizedState* vs = scanState->ps.vectorized;
+ int row = 0;
+
+ for(;row < tb->batchsize;row ++)
+ {
+ AppendOnlyScanNext(scanState);
+
+ slot = scanState->ss_ScanTupleSlot;
+ if(TupIsNull(slot))
+ break;
+
+ for(int i = 0;i < tb->ncols ; i ++)
+ {
+
+ if(vs->proj[i])
+ {
+ Oid hawqTypeID = slot->tts_tupleDescriptor->attrs[i]->atttypid;
+ Oid hawqVTypeID = GetVtype(hawqTypeID);
+ if(!tb->datagroup[i])
+ tbCreateColumn(tb,i,hawqVTypeID);
+
+ Datum *ptr = GetVFunc(hawqVTypeID)->gettypeptr(tb->datagroup[i],row);
+ *ptr = slot_getattr(slot,i + 1, &(tb->datagroup[i]->isnull[row]));
+
+ /* if attribute is a reference, deep copy the data out to prevent ao table buffer free before vectorized scan batch done */
+ if(!slot->tts_mt_bind->tupdesc->attrs[i]->attbyval)
+ *ptr = datumCopy(*ptr,slot->tts_mt_bind->tupdesc->attrs[i]->attbyval,slot->tts_mt_bind->tupdesc->attrs[i]->attlen);
+ }
+ }
+
+ AppendOnlyScanDesc scanDesc = ((AppendOnlyScanState*)scanState)->aos_ScanDesc;
+ VarBlockHeader *header = scanDesc->executorReadBlock.varBlockReader.header;
+
+ //if(row + 1 == VarBlockGet_itemCount(header))
--- End diff --
Please remove the commented-out (dead) code.
---
[GitHub] incubator-hawq pull request #1350: HAWQ-1600. Parquet table data vectorized ...
Posted by linwen <gi...@git.apache.org>.
Github user linwen commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1350#discussion_r178483664
--- Diff: contrib/vexecutor/vcheck.h ---
@@ -27,6 +27,8 @@ typedef struct vFuncMap
Oid ntype;
vheader* (* vtbuild)(int n);
void (* vtfree)(vheader **vh);
+ Datum (* gettypeptr)(vheader *vh,int n);
--- End diff --
Please fix indent here.
---