You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@orc.apache.org by spasam <gi...@git.apache.org> on 2017/07/14 17:34:21 UTC

[GitHub] orc pull request #131: ORC-199. Add convert from CSV.

Github user spasam commented on a diff in the pull request:

    https://github.com/apache/orc/pull/131#discussion_r127508667
  
    --- Diff: java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java ---
    @@ -18,53 +18,178 @@
     package org.apache.orc.tools.convert;
     
     import org.apache.commons.cli.CommandLine;
    -import org.apache.commons.cli.GnuParser;
    +import org.apache.commons.cli.DefaultParser;
     import org.apache.commons.cli.HelpFormatter;
     import org.apache.commons.cli.Option;
     import org.apache.commons.cli.Options;
     import org.apache.commons.cli.ParseException;
     import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.FSDataInputStream;
    +import org.apache.hadoop.fs.FileSystem;
     import org.apache.hadoop.fs.Path;
     import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
     import org.apache.orc.OrcFile;
    +import org.apache.orc.Reader;
     import org.apache.orc.RecordReader;
     import org.apache.orc.TypeDescription;
     import org.apache.orc.Writer;
     import org.apache.orc.tools.json.JsonSchemaFinder;
     
     import java.io.IOException;
    +import java.io.InputStream;
    +import java.io.InputStreamReader;
    +import java.nio.charset.StandardCharsets;
    +import java.util.ArrayList;
    +import java.util.List;
    +import java.util.zip.GZIPInputStream;
     
     /**
    - * A conversion tool to convert JSON files into ORC files.
    + * A conversion tool to convert CSV or JSON files into ORC files.
      */
     public class ConvertTool {
    +  private final List<FileInformation> fileList;
    +  private final TypeDescription schema;
    +  private final char csvSeparator;
    +  private final char csvQuote;
    +  private final char csvEscape;
    +  private final int csvHeaderLines;
    +  private final String csvNullString;
    +  private final Writer writer;
    +  private final VectorizedRowBatch batch;
     
    -  static TypeDescription computeSchema(String[] filename) throws IOException {
    +  TypeDescription buildSchema(List<FileInformation> files,
    +                              Configuration conf) throws IOException {
         JsonSchemaFinder schemaFinder = new JsonSchemaFinder();
    -    for(String file: filename) {
    -      System.err.println("Scanning " + file + " for schema");
    -      schemaFinder.addFile(file);
    +    for(FileInformation file: files) {
    +      if (file.format == Format.JSON) {
    +        System.err.println("Scanning " + file.path + " for schema");
    +        schemaFinder.addFile(file.getReader(file.filesystem.open(file.path)));
    +      } else if (file.format == Format.ORC) {
    +        System.err.println("Merging schema from " + file.path);
    +        Reader reader = OrcFile.createReader(file.path,
    +            OrcFile.readerOptions(conf)
    +                .filesystem(file.filesystem));
    +        schemaFinder.addSchema(reader.getSchema());
    +      }
         }
         return schemaFinder.getSchema();
    --- End diff --
    
    This throws an NPE when no command-line arguments are specified other than a CSV file:
    
    ```
    Exception in thread "main" java.lang.NullPointerException
    	at org.apache.orc.tools.json.JsonSchemaFinder.getSchema(JsonSchemaFinder.java:321)
    	at org.apache.orc.tools.convert.ConvertTool.buildSchema(ConvertTool.java:75)
    
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not
working, please contact infrastructure at infrastructure@apache.org or file
a JIRA ticket with INFRA.
---