Posted to user@hive.apache.org by 王锋 <wf...@163.com> on 2012/08/09 13:04:19 UTC

A Hive bug with a UDF

Hi,


    Here is the source code of my UDF, Minf:


import org.apache.hadoop.hive.ql.exec.UDF;

import com.sina.dip.util.DateUtil;

public class Minf extends UDF {

    public String evaluate(String time) {
        if (time != null && time.trim().length() > 0) {
            time = time.trim();
            time = time.replace("[", "");   // strip the leading '[' of the Apache log time field
            String yyyymmdd = DateUtil.DateToString(DateUtil.engStrToDate(time,
                    "dd/MMM/yyyy:HH:mm:ss"), "yyyyMMdd");
            int HH = Integer.parseInt(DateUtil.DateToString(DateUtil.engStrToDate(time,
                    "dd/MMM/yyyy:HH:mm:ss"), "HH"));
            int mm = Integer.parseInt(DateUtil.DateToString(DateUtil.engStrToDate(time,
                    "dd/MMM/yyyy:HH:mm:ss"), "mm"));
            int minif = HH * 12 + mm / 5;   // 5-minute bucket of the day (0..287)
            String Minif = yyyymmdd + String.format("%03d", minif);

            // Debug output for corrupted values. String.contains takes a literal,
            // not a regex, so the check must be "[" rather than "\\[".
            if (Minif.contains("[")) {
                System.out.println("yyyymmdd=" + yyyymmdd + "\tMinf=" + Minif);
            }
            // The concatenation above can never be null, so no null check is needed.
            return Minif.trim();
        } else {
            return "";
        }
    }

    public static void main(String[] args) {
        String time = "[09/Aug/2012:16:49:59";
        System.out.println(new Minf().evaluate(time));   // expected: 20120809201
    }
}


The code transforms a timestamp from an Apache access log into a minf value, the date followed by the index of the day's 5-minute interval; for example, '[09/Aug/2012:16:49:59' --> 20120809201.
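To spell out the arithmetic behind that example, here is a minimal standalone snippet (a worked restatement of the code above, nothing new; the class name is just for the demo):

public class MinfArithmetic {
    public static void main(String[] args) {
        // "[09/Aug/2012:16:49:59" -> date 20120809, HH = 16, mm = 49
        int HH = 16, mm = 49;
        int minif = HH * 12 + mm / 5;   // 12 five-minute buckets per hour; integer division
        System.out.println("20120809" + String.format("%03d", minif));   // prints 20120809201
    }
}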
But when we use this UDF in Hive to process the Apache logs, we sometimes find wrong minf values, such as:


[09/Aug/2012:16:49:59   201208[09201
[09/Aug/2012:16:49:59   201208[09201

Note that the '[' shows up inside the date portion ('201208[09'), even though evaluate strips the leading '[' before any formatting. But when we run the same evaluate method in a plain MapReduce job, we cannot reproduce the problem. The MapReduce code:


import java.io.IOException;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.sina.dip.util.DateUtil;
import com.sina.dip.util.RegexUtil;

public class TestMinf {

    // Same logic as Minf.evaluate above, copied here so the job has no
    // dependency on Hive.
    public static String evaluate(String time) {
        if (time != null && time.trim().length() > 0) {
            time = time.trim();
            time = time.replace("[", "");
            String yyyymmdd = DateUtil.DateToString(DateUtil.engStrToDate(time,
                    "dd/MMM/yyyy:HH:mm:ss"), "yyyyMMdd");
            int HH = Integer.parseInt(DateUtil.DateToString(DateUtil.engStrToDate(time,
                    "dd/MMM/yyyy:HH:mm:ss"), "HH"));
            int mm = Integer.parseInt(DateUtil.DateToString(DateUtil.engStrToDate(time,
                    "dd/MMM/yyyy:HH:mm:ss"), "mm"));
            int minif = HH * 12 + mm / 5;   // 5-minute bucket of the day (0..287)
            String Minif = yyyymmdd + String.format("%03d", minif);

            // Debug output for corrupted values ("[" is a literal, not a regex).
            if (Minif.contains("[")) {
                System.out.println("yyyymmdd=" + yyyymmdd + "\tMinf=" + Minif);
            }
            return Minif.trim();
        } else {
            return "";
        }
    }

    // Maps each log line to (minf, 1).
    public static class ExtendLogMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, LongWritable> {

        public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output,
                Reporter reporter) throws IOException {
            Text minf = new Text();
            String line = value.toString();
            Map data = RegexUtil.parseApache(line);   // our in-house Apache log parser
            String createtime = (String) data.get("createtime");
            String _minf = evaluate(createtime);
            minf.set(_minf);
            output.collect(minf, new LongWritable(1));
        }
    }

    // Sums the counts per minf value.
    public static class ExtendLogReducer extends MapReduceBase implements
            Reducer<Text, LongWritable, Text, LongWritable> {
        public void reduce(Text key, Iterator<LongWritable> values,
                OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            long sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration cf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(cf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: TestMinf <in> <out>");
            System.exit(1);
        }
        JobConf conf = new JobConf(cf, TestMinf.class);
        conf.setJobName("testmr");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);

        conf.setMapperClass(ExtendLogMapper.class);
        conf.setCombinerClass(ExtendLogReducer.class);
        conf.setReducerClass(ExtendLogReducer.class);
//        conf.setPartitionerClass(MyPartitioner.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setNumReduceTasks(1);
        FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

        JobClient.runJob(conf);
    }
}


Using this Java MapReduce code on the same data, we could not find any wrong minf values.
The source data was from hour 16 of 2012-08-09, and its size was 24,739,162,624 bytes (about 23 GB).
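One thing we have not yet been able to rule out (an assumption, since DateUtil's internals are not shown here): SimpleDateFormat is not thread-safe, so if DateUtil caches a single shared instance, any concurrent use inside a task could garble exactly this kind of output. A defensive sketch of a per-thread formatter (the class and method names are hypothetical):

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

// Hypothetical helper: one SimpleDateFormat instance per thread, because
// SimpleDateFormat itself is unsafe for concurrent use.
public class SafeDateUtil {

    private static final ThreadLocal<SimpleDateFormat> ENG_FORMAT =
            new ThreadLocal<SimpleDateFormat>() {
                @Override
                protected SimpleDateFormat initialValue() {
                    // Locale.ENGLISH so "Aug" parses regardless of the JVM's default locale
                    return new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
                }
            };

    public static Date engStrToDate(String s) throws ParseException {
        return ENG_FORMAT.get().parse(s);
    }
}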


We think this problem may be caused by the Hive UDF. Please give me some suggestions.
Thanks.