Posted to commits@hawq.apache.org by hu...@apache.org on 2015/12/03 08:56:49 UTC

incubator-hawq git commit: HAWQ-210. Improve data locality by calculating the insert host.

Repository: incubator-hawq
Updated Branches:
  refs/heads/master 951e9899c -> bbd7e0ae3


HAWQ-210. Improve data locality by calculating the insert host.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/bbd7e0ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/bbd7e0ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/bbd7e0ae

Branch: refs/heads/master
Commit: bbd7e0ae3cc223777554f2b38be2c0e0a6a673ec
Parents: 951e989
Author: hubertzhang <hz...@pivotal.io>
Authored: Thu Dec 3 14:56:10 2015 +0800
Committer: hubertzhang <hz...@pivotal.io>
Committed: Thu Dec 3 15:55:30 2015 +0800

----------------------------------------------------------------------
 src/backend/cdb/cdbdatalocality.c | 50 ++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
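
Background for the change: HDFS places the first replica of every block on
the node that issues the write, so the segment host that originally inserted
a file's data tends to appear in the replica list of nearly all of that
file's blocks. The patch below infers that "insert host" per file by a
majority vote over the replica host IDs, then prefers it when breaking ties
during virtual-segment assignment, so scans of freshly written data stay
local. A minimal standalone sketch of the inference step (a hypothetical
helper, not the committed code; names and types are simplified from
cdbdatalocality.c, and standard calloc/free stand in for the backend's
palloc/pfree):

    #include <stdlib.h>

    typedef struct {
        int replica_num;
        int *hostIndex;   /* host IDs holding a replica of this block */
    } BlockReplicas;

    /* Return the host ID that appears in the replica lists of the most
     * blocks, or -1 on failure; with HDFS placement this is very likely
     * the node that wrote the file. */
    static int infer_insert_host(const BlockReplicas *blocks, int block_num,
                                 int host_count)
    {
        int *occurrences = calloc(host_count, sizeof(int));
        int best_host = -1, best_count = -1;

        if (occurrences == NULL)
            return -1;

        for (int b = 0; b < block_num; b++)
            for (int r = 0; r < blocks[b].replica_num; r++)
                occurrences[blocks[b].hostIndex[r]]++;

        for (int h = 0; h < host_count; h++) {
            if (occurrences[h] > best_count) {
                best_count = occurrences[h];
                best_host = h;
            }
        }
        free(occurrences);
        return best_host;
    }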


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bbd7e0ae/src/backend/cdb/cdbdatalocality.c
----------------------------------------------------------------------
diff --git a/src/backend/cdb/cdbdatalocality.c b/src/backend/cdb/cdbdatalocality.c
index eaf704d..6c45b9f 100644
--- a/src/backend/cdb/cdbdatalocality.c
+++ b/src/backend/cdb/cdbdatalocality.c
@@ -154,6 +154,7 @@ typedef struct Block_Host_Index {
 	int replica_num;
 	int* hostIndex; // hdfs host name list (size is replica_num)
 	Host_Index *hostIndextoSort; // used to sort host indexes
+	int insertHost; // the host that inserted this block
 } Block_Host_Index;
 
 /*
@@ -1016,6 +1017,7 @@ update_data_dist_stat(split_to_segment_mapping_context *context,
 	for (i = 0; i < block_num; i++) {
 		int j;
 		hostIDs[i].replica_num = locations[i].numOfNodes;
+		hostIDs[i].insertHost = -1;
 		hostIDs[i].hostIndex = (int *) palloc(sizeof(int) * hostIDs[i].replica_num);
 		hostIDs[i].hostIndextoSort = (Host_Index *) palloc(
 				sizeof(Host_Index) * hostIDs[i].replica_num);
@@ -1603,9 +1605,14 @@ static int select_random_host_algorithm(Relation_Assignment_Context *context,
 					if (minvols > context->vols[j]) {
 						minvols = context->vols[j];
 						minindex = j;
-					} else {
-						if (context->block_lessthan_vseg_round_robin_no >= 0) {
-							if (minvols == context->vols[j] && l == context->block_lessthan_vseg_round_robin_no) {
+					} else if (minvols == context->vols[j]) {
+						/* prefer the insert host if one exists */
+						int inserthost = (*hostID)->insertHost;
+						if (inserthost == idMap->global_IDs[j]) {
+							minindex = j;
+						}
+						else if (context->block_lessthan_vseg_round_robin_no >= 0) {
+							if (l == context->block_lessthan_vseg_round_robin_no) {
 								minindex = j;
 								context->block_lessthan_vseg_round_robin_no = (context->block_lessthan_vseg_round_robin_no+1)%(*hostID)->replica_num;
 							}
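
The hunk above changes only the tie-breaking order inside
select_random_host_algorithm: the candidate with the fewest assigned bytes
still wins outright, but when several candidates are tied at the minimum,
the inferred insert host is now preferred before the old round-robin counter
is consulted. A simplified sketch of the revised ordering (a hypothetical
free-standing function, not the committed code; the real loop walks the
replica candidates of one block and matches the replica index against the
round-robin counter):

    /* Pick among 'candidates' hosts: least load first, then insert host,
     * then round-robin. vols[j] is bytes already assigned to candidate j,
     * global_ids[j] its cluster-wide host ID. */
    static int pick_candidate(const int *vols, const int *global_ids,
                              int candidates, int insert_host,
                              int *round_robin_no)
    {
        int minvols = 0, minindex = -1;

        for (int j = 0; j < candidates; j++) {
            if (minindex < 0 || vols[j] < minvols) {
                minvols = vols[j];
                minindex = j;
            } else if (vols[j] == minvols) {
                if (global_ids[j] == insert_host) {
                    minindex = j;            /* prefer the insert host */
                } else if (*round_robin_no >= 0 && j == *round_robin_no) {
                    minindex = j;            /* fall back to round-robin */
                    *round_robin_no = (*round_robin_no + 1) % candidates;
                }
            }
        }
        return minindex;
    }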
@@ -2645,6 +2652,43 @@ static void allocate_random_relation(Relation_Data* rel_data,
 	int* isBlockContinue = (int *) palloc(
 			sizeof(int) * assignment_context->virtual_segment_num);
 
+
+	/* find the insert host for each block */
+	int *hostOccurTimes = (int *) palloc(
+			sizeof(int) * context->dds_context.size);
+	for (int fi = 0; fi < fileCount; fi++) {
+		Relation_File *rel_file = file_vector[fi];
+		/* for hash files whose bucket number does not equal the segment number */
+		if (rel_file->hostIDs == NULL) {
+			rel_file->splits[0].host = 0;
+			assignment_context->total_split_num += 1;
+			continue;
+		}
+		MemSet(hostOccurTimes, 0, sizeof(int) * context->dds_context.size);
+		for (i = 0; i < rel_file->split_num; i++) {
+			Block_Host_Index *hostID = rel_file->hostIDs + i;
+			for (int l = 0; l < hostID->replica_num; l++) {
+				uint32_t key = hostID->hostIndex[l];
+				hostOccurTimes[key]++;
+			}
+		}
+		int maxOccurTime = -1;
+		int inserthost = -1;
+		for (int h = 0; h < context->dds_context.size; h++) {
+			if (hostOccurTimes[h] > maxOccurTime) {
+				maxOccurTime = hostOccurTimes[h];
+				inserthost = h;
+			}
+		}
+		/* we assume the insert host is the same for every block of a file; this logic can be refined in the future */
+		for (i = 0; i < rel_file->split_num; i++) {
+			Block_Host_Index *hostID = rel_file->hostIDs + i;
+			hostID->insertHost = inserthost;
+		}
+	}
+	pfree(hostOccurTimes);
+
+	/* three-stage allocation algorithm */
 	for (int fi = 0; fi < fileCount; fi++) {
 		Relation_File *rel_file = file_vector[fi];
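
----------------------------------------------------------------------

As a concrete illustration, reusing the infer_insert_host sketch from the
note above on a file of three blocks with replication factor 3 (all host
IDs invented for the example):

    #include <stdio.h>

    int main(void)
    {
        int b0[] = {0, 1, 2}, b1[] = {0, 2, 3}, b2[] = {0, 1, 3};
        BlockReplicas blocks[] = { {3, b0}, {3, b1}, {3, b2} };

        /* Host 0 holds a replica of every block (3 occurrences versus at
         * most 2 for any other host), so it is inferred as the insert host
         * and wins allocation ties for this file's splits. */
        printf("insert host = %d\n", infer_insert_host(blocks, 3, 4));
        return 0;
    }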