You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@tajo.apache.org by "Hyoungjun Kim (JIRA)" <ji...@apache.org> on 2014/07/09 08:32:04 UTC

[jira] [Created] (TAJO-925) Child ExecutionBlock of JOIN node has different number of shuffle keys.

Hyoungjun Kim created TAJO-925:
----------------------------------

             Summary: Child ExecutionBlock of JOIN node has different number of shuffle keys.
                 Key: TAJO-925
                 URL: https://issues.apache.org/jira/browse/TAJO-925
             Project: Tajo
          Issue Type: Bug
            Reporter: Hyoungjun Kim
            Priority: Minor


If both sides of a join node are not SCAN nodes but SUBQUERY nodes, each child node has a different number of shuffle keys.
In that case, the JOIN query returns a wrong result. I reproduced the problem with the test code below.
{code}
/**
 * Reproduces TAJO-925: a join whose two inputs are both sub-queries (not plain
 * scans) may end up with a different number of shuffle keys on each side and
 * return a wrong result.
 *
 * <p>The test builds a table of roughly 2 MB of {@code id|name} rows, lowers
 * {@code DIST_QUERY_JOIN_PARTITION_VOLUME} to 1, and left-outer-joins two
 * grouped sub-queries over that table. The right side is restricted to
 * {@code id < 200}, so {@code count(b.id)} is expected to be 200.
 *
 * <p>The configuration override and the query both run inside the
 * {@code try} block, so the original configuration value is restored and the
 * table is dropped even when the query itself fails.
 *
 * @throws Exception if table creation, query execution or cleanup fails
 */
@Test
public void testJoinWithDifferentShuffleKey() throws Exception {
  KeyValueSet tableOptions = new KeyValueSet();
  tableOptions.put(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER);
  tableOptions.put(StorageConstants.CSVFILE_NULL, "\\\\N");

  Schema schema = new Schema();
  schema.addColumn("id", Type.INT4);
  schema.addColumn("name", Type.TEXT);

  List<String> data = new ArrayList<String>();

  // Generate rows until slightly more than 2 MB has been accumulated; the
  // 1,000,000 iteration bound is only a safety cap.
  int bytes = 0;
  for (int i = 0; i < 1000000; i++) {
    String row = i + "|" + i + "name012345678901234567890123456789012345678901234567890";
    bytes += row.getBytes().length;
    data.add(row);
    if (bytes > 2 * 1024 * 1024) {
      break;
    }
  }
  TajoTestingCluster.createTable("large_table", schema, tableOptions, data.toArray(new String[]{}));

  // Remember the current value so it can be restored in the finally block.
  int originConfValue = conf.getIntVar(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME);
  ResultSet res = null;

  try {
    // NOTE(review): presumably a volume of 1 forces the join input to be split
    // into many shuffle partitions -- confirm against the ConfVars documentation.
    testingCluster.setAllTajoDaemonConfValue(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME.varname, "1");
    res = executeString(
       "select count(b.id) " +
           "from (select id, count(*) as cnt from large_table group by id) a " +
           "left outer join (select id, count(*) as cnt from large_table where id < 200 group by id) b " +
           "on a.id = b.id"
    );

    String expected =
        "?count\n" +
            "-------------------------------\n" +
            "200\n";

    assertEquals(expected, resultSetToString(res));
  } finally {
    testingCluster.setAllTajoDaemonConfValue(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME.varname, "" + originConfValue);
    // res stays null when executeString itself threw.
    if (res != null) {
      cleanupQuery(res);
    }
    executeString("DROP TABLE large_table PURGE").close();
  }
}
{code}



--
This message was sent by Atlassian JIRA
(v6.2#6252)