You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@hive.apache.org by 徐厚道 <xu...@gmail.com> on 2011/03/24 07:35:54 UTC

How do I write a SerDe? I want to support Nutch sequence files.

Can anyone help me, or point me to some related documentation?

I want to load data from Nutch's sequence files.

My code is below.
I can execute the LOAD DATA script normally,
but when I run 'select * from table', an error occurs.
It reports a cast exception on Content (a ClassCastException) at the line
'Iterator<Writable> values = (Iterator<Writable>)blob;'.


Thanks for your help!

public class NutchSequenceFileSerDe implements SerDe {

  public static final Log LOG =
LogFactory.getLog(NutchSequenceFileSerDe.class.getName());

  int numColumns;
  String inputRegex;
  String outputFormatString;

  Pattern inputPattern;

  StructObjectInspector rowOI;
  ArrayList<String> row;

  @Override
  public void initialize(Configuration conf, Properties tbl)
      throws SerDeException {

    // We can get the table definition from tbl.

    // Read the configuration parameters

    String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS);
    String columnTypeProperty =
tbl.getProperty(Constants.LIST_COLUMN_TYPES);

    List<String> columnNames = Arrays.asList(columnNameProperty.split(","));
    List<TypeInfo> columnTypes = TypeInfoUtils
        .getTypeInfosFromTypeString(columnTypeProperty);
    assert columnNames.size() == columnTypes.size();
    numColumns = columnNames.size();

    // All columns have to be of type STRING.
    for (int c = 0; c < numColumns; c++) {
      if (!columnTypes.get(c).equals(TypeInfoFactory.stringTypeInfo)) {
        throw new SerDeException(getClass().getName()
            + " only accepts string columns, but column[" + c + "] named "
            + columnNames.get(c) + " has type " + columnTypes.get(c));
      }
    }

    // Constructing the row ObjectInspector:
    // The row consists of some string columns, each column will be a java
    // String object.
    List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(
        columnNames.size());
    for (int c = 0; c < numColumns; c++) {

 columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    }
    // StandardStruct uses ArrayList to store the row.
    rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        columnNames, columnOIs);

    // Constructing the row object, etc, which will be reused for all rows.
    row = new ArrayList<String>(numColumns);
    for (int c = 0; c < numColumns; c++) {
      row.add(null);
    }
    outputFields = new Object[numColumns];
    outputRowText = new Text();
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return rowOI;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  // Number of rows not matching the regex
  long unmatchedRows = 0;
  long nextUnmatchedRows = 1;
  // Number of rows that match the regex but have missing groups.
  long partialMatchedRows = 0;
  long nextPartialMatchedRows = 1;

  long getNextNumberToDisplay(long now) {
    return now * 10;
  }

  @Override
  public Object deserialize(Writable blob) throws SerDeException {

      Iterator<Writable> values = (Iterator<Writable>)blob;

      //dump.append(key.toString() + "\001");
      int colIndex =0;
      while (values.hasNext()) {
        try {
          Writable value = values.next();//.get(); // unwrap
          String vString = value.toString();
          row.set(colIndex,vString);

        } catch (RuntimeException e) {

          row.set(colIndex, null);
        }
        colIndex ++;
      }

    return row;
  }

  Object[] outputFields;
  Text outputRowText;

  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
      throws SerDeException {
    // Get all the fields out.
    // NOTE: The correct way to get fields out of the row is to use
    // objInspector.
    // The obj can be a Java ArrayList, or a Java class, or a byte[] or
    // whatever.
    // The only way to access the data inside the obj is through
    // ObjectInspector.

    StructObjectInspector outputRowOI = (StructObjectInspector)
objInspector;
    List<? extends StructField> outputFieldRefs = outputRowOI
        .getAllStructFieldRefs();
    if (outputFieldRefs.size() != numColumns) {
      throw new SerDeException("Cannot serialize the object because there
are "
          + outputFieldRefs.size() + " fields but the table has " +
numColumns
          + " columns.");
    }

    // Get all data out.
    for (int c = 0; c < numColumns; c++) {
      Object field = outputRowOI
          .getStructFieldData(obj, outputFieldRefs.get(c));
      ObjectInspector fieldOI = outputFieldRefs.get(c)
          .getFieldObjectInspector();
      // The data must be of type String
      StringObjectInspector fieldStringOI = (StringObjectInspector) fieldOI;
      // Convert the field to Java class String, because objects of String
type
      // can be
      // stored in String, Text, or some other classes.
      outputFields[c] = fieldStringOI.getPrimitiveJavaObject(field);
    }

    // Format the String
    String outputRowString = "";
    for (Object o : outputFieldRefs) {
      outputRowString += o.toString();
    }
    outputRowText.set(outputRowString);
    return outputRowText;
  }

}