You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@hive.apache.org by Dave Maughan <da...@gmail.com> on 2015/06/12 17:21:21 UTC

Writing uniontype to ORC file outside of hive

Hi,

I'm trying to write ORC files outside of hive and I'm currently
looking at uniontype.

I've identified multiple ways to do this. I could use a Writable or
Java StandardStructObjectInspector and pass in a StandardUnion.
However, the OrcInputFormat uses OrcStruct as the value type and I'd
like to keep writing consistent. This limits me to using
OrcStruct.createObjectInspector(). There are a couple of issues I've
noticed with this:

1. It's not possible to set a union field of an OrcStruct using a
StandardUnion as it generates a ClassCastException because
OrcUnionObjectInspector.getTag(Object obj) casts obj to OrcUnion to
return the private OrcUnion.tag field.
  - If the getTag method just delegated to UnionObject.getTag() and
the getField method likewise delegated to UnionObject.getObject(), then
it would be possible to use StandardUnion instead.
2. It's not possible to create an OrcUnion instance or set its
tag/value because the class and its set method are package private.
  - If these were public then it would be possible to use OrcUnion
instead of StandardUnion and avoid the issue in 1.

I can work around this by creating my own public factory for OrcUnion
in the org.apache.hadoop.hive.ql.io.orc package but it's not ideal.

I've run tests of all 6 of the combinations of Writable Standard/Java
Standard/Orc StructObjectInspector and StandardUnion/OrcUnion (see
below). The only combination that *doesn't* work is
OrcStructObjectInspector with StandardUnion. However, with the current
class/method accessibility, this is the only OrcStruct-based option
available out of the box, which leaves me with the ugly OrcUnion factory
workaround.

Is there something I've missed? Is there a specific reason for these
two observations or were they just an oversight?

Thanks,
Dave


package org.apache.hadoop.hive.ql.io.orc;

/**
 * Workaround factory that lets code outside this package obtain populated
 * {@code OrcUnion} values.
 *
 * <p>It is declared in the same package as {@code OrcUnion} so that it can
 * call the {@code OrcUnion} constructor and {@code set} method used below.
 */
public final class OrcUnionFactory {

  /** Not instantiable; this class only offers a static factory method. */
  private OrcUnionFactory() {
  }

  /**
   * Builds a new {@code OrcUnion} holding the supplied tag and value.
   *
   * @param tag the tag handed to {@code OrcUnion.set}
   * @param value the value handed to {@code OrcUnion.set}
   * @return the populated union, typed as {@code Object}
   */
  public static Object newInstance(byte tag, Object value) {
    final OrcUnion result = new OrcUnion();
    result.set(tag, value);
    return result;
  }
}

-----

/**
 * Parameterized round-trip test: writes one row containing a single
 * {@code uniontype<int>} field to an ORC file and reads it back.
 *
 * <p>Each parameter set pairs a struct object inspector (standard Java,
 * standard Writable, or the one from {@code OrcStruct.createObjectInspector})
 * with a union value ({@code StandardUnion} or an {@code OrcUnion} built via
 * {@code OrcUnionFactory}), giving six combinations in total.
 */
@RunWith(Parameterized.class)
public class OrcUnionTest {

  /** Tag of the only alternative in the union. */
  private static final byte TAG = (byte) 0;
  /** Integer payload written into, and expected back from, the union. */
  private static final int VALUE = 0;
  /** Row schema: a struct with one field named "union" of type uniontype&lt;int&gt;. */
  private static final TypeInfo TYPE_INFO =
      TypeInfoUtils.getTypeInfoFromTypeString("struct<union:uniontype<int>>");

  @Rule
  public TemporaryFolder temporaryFolder = new TemporaryFolder();

  private final Configuration conf = new Configuration();

  /** Inspector used both to build the row and to configure the ORC writer. */
  private final SettableStructObjectInspector inspector;
  /** Union value stored in the row's "union" field. */
  private final Object union;

  /**
   * Supplies the six inspector/union combinations.
   *
   * @return parameter arrays of the form {@code {testName, inspector, union}}
   */
  @Parameters(name = "{0}")
  public static Iterable<Object[]> tests() {
    List<Object[]> tests = new ArrayList<>();

    tests.add(new Object[] { "StandardJava_StandardUnion",
        TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(TYPE_INFO),
        new StandardUnion(TAG, VALUE) });
    tests.add(new Object[] { "StandardJava_OrcUnion",
        TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(TYPE_INFO),
        OrcUnionFactory.newInstance(TAG, VALUE) });
    tests.add(new Object[] { "StandardWritable_StandardUnion",
        TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(TYPE_INFO),
        new StandardUnion(TAG, new IntWritable(VALUE)) });
    tests.add(new Object[] { "StandardWritable_OrcUnion",
        TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(TYPE_INFO),
        OrcUnionFactory.newInstance(TAG, new IntWritable(VALUE)) });
    tests.add(new Object[] { "Orc_StandardUnion",
        OrcStruct.createObjectInspector(TYPE_INFO),
        new StandardUnion(TAG, new IntWritable(VALUE)) });
    tests.add(new Object[] { "Orc_OrcUnion",
        OrcStruct.createObjectInspector(TYPE_INFO),
        OrcUnionFactory.newInstance(TAG, new IntWritable(VALUE)) });

    return tests;
  }

  /**
   * @param testName display name of the combination; consumed only by the
   *     {@code @Parameters(name = "{0}")} pattern
   * @param inspector struct inspector used to create and write the row
   * @param union union value to store in the row
   */
  public OrcUnionTest(String testName, SettableStructObjectInspector inspector, Object union) {
    this.inspector = inspector;
    this.union = union;
  }

  /**
   * Writes a single row holding the union, reads the file back and checks
   * that exactly one row with the expected value is returned.
   *
   * @throws IOException if writing or reading the ORC file fails
   */
  @Test
  public void exerciseTest() throws IOException {
    Path path = new Path(temporaryFolder.getRoot().getCanonicalPath(), "part-00000");

    WriterOptions writerOptions = OrcFile.writerOptions(conf).inspector(inspector);
    Writer writer = OrcFile.createWriter(path, writerOptions);

    Object row = inspector.create();
    inspector.setStructFieldData(row, inspector.getStructFieldRef("union"), union);

    writer.addRow(row);

    // The writer is closed before the file is opened for reading.
    writer.close();

    ReaderOptions readerOptions = OrcFile.readerOptions(conf);
    Reader reader = OrcFile.createReader(path, readerOptions);
    // Named distinctly from the 'inspector' field, which is the write-side inspector.
    ObjectInspector readInspector = reader.getObjectInspector();
    RecordReader rows = reader.rows();
    try {
      // The file schema is a struct, which the assertions below treat as a List.
      @SuppressWarnings("unchecked")
      List<Object> readRow = (List<Object>)
          ObjectInspectorUtils.copyToStandardJavaObject(rows.next(null), readInspector);

      assertThat(readRow.size(), is(1));
      assertThat(readRow.get(0), is((Object) VALUE));
      assertFalse(rows.hasNext());
    } finally {
      // Release the record reader even when an assertion fails.
      rows.close();
    }
  }

}