You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@hive.apache.org by 王锋 <wf...@163.com> on 2012/08/09 13:04:19 UTC
a hive bug about udf
Hi,
Here is the source code of my UDF, Minf:
import org.apache.hadoop.hive.ql.exec.UDF;
import com.sina.dip.util.DateUtil;
/**
 * Hive UDF that turns an Apache access-log timestamp into a "minf" key:
 * the date as {@code yyyyMMdd} followed by the zero-padded, three-digit index
 * of the five-minute slot within that day ({@code hour * 12 + minute / 5}).
 *
 * <p>Example: {@code "[09/Aug/2012:16:49:59"} yields {@code "20120809201"}.
 *
 * <p>NOTE(review): every {@code '['} is stripped from the input before parsing and
 * the slot suffix is produced by {@code String.format("%03d", ...)}, so a
 * {@code '['} showing up in the date part of the result can only come from the
 * value returned by {@code DateUtil.DateToString}. If {@code DateUtil} shares
 * formatter state between calls, that is the first place to look - TODO confirm.
 */
public class Minf extends UDF {

    /** Layout of the log timestamp once the leading '[' has been removed. */
    private static final String LOG_TIME_PATTERN = "dd/MMM/yyyy:HH:mm:ss";

    /**
     * Converts one log timestamp to its minf key.
     *
     * @param time raw timestamp field, optionally prefixed with {@code '['};
     *             may be {@code null}
     * @return the minf key, or an empty string when {@code time} is
     *         {@code null} or blank
     * @throws NumberFormatException if the hour or minute text returned by
     *         {@code DateUtil} is not a valid integer
     */
    public String evaluate(String time) {
        if (time == null || time.trim().length() == 0) {
            return "";
        }

        time = time.trim();
        time = time.replace("[", "");

        String yyyymmdd = reformat(time, "yyyyMMdd");
        int hour = Integer.parseInt(reformat(time, "HH"));
        int minute = Integer.parseInt(reformat(time, "mm"));

        // 12 five-minute slots per hour, so the slot index ranges over 0..287.
        int slot = hour * 12 + minute / 5;
        String minf = yyyymmdd + String.format("%03d", slot);

        // Diagnostic for corrupted keys. String.contains takes a literal, not a
        // regex, so the bracket must not be escaped here.
        if (minf.contains("[")) {
            System.out.println("yyyymmdd=" + yyyymmdd + "\tMinf=" + minf);
        }
        return minf.trim();
    }

    /** Parses {@code time} with the log pattern and re-renders it with {@code outPattern} via DateUtil. */
    private static String reformat(String time, String outPattern) {
        return DateUtil.DateToString(DateUtil.engStrToDate(time, LOG_TIME_PATTERN), outPattern);
    }

    /** Manual smoke test: prints the minf key for a sample timestamp. */
    public static void main(String[] args) {
        String time = "[09/Aug/2012:16:49:59";
        System.out.println(new Minf().evaluate(time));
    }
}
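This code converts a timestamp from an Apache log into a "minf" value: the date followed by the zero-padded index of the five-minute interval within the day. For example, '[09/Aug/2012:16:49:59' --> 20120809201.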
However, when we use this UDF in Hive to process Apache logs, some of the resulting minf values are occasionally wrong, for example:
[09/Aug/2012:16:49:59 201208[09201
[09/Aug/2012:16:49:59 201208[09201
We then ran a plain MapReduce job that uses only the code of the evaluate method. The code:
public class TestMinf {
public static String evaluate(String time) {
if (time != null && time.trim().length() > 0) {
time = time.trim();
time = time.replace("[", "");
String yyyymmdd = DateUtil.DateToString(DateUtil.engStrToDate(time,
"dd/MMM/yyyy:HH:mm:ss"), "yyyyMMdd");
int HH = Integer.parseInt(DateUtil.DateToString(DateUtil.engStrToDate(time,
"dd/MMM/yyyy:HH:mm:ss"), "HH"));
int mm = Integer.parseInt(DateUtil.DateToString(DateUtil.engStrToDate(time,
"dd/MMM/yyyy:HH:mm:ss"), "mm"));
int minif = HH * 12 + mm / 5;
String Minif = yyyymmdd + String.format("%03d", minif);
if (Minif != null) {
if(Minif.contains("\\[")){
System.out.println("yyyymmdd="+yyyymmdd+"\tMinf="+Minif);
}
return Minif.trim().toString();
} else {
return "";
}
} else
return "";
}
public static class ExtendLogMapper extends MapReduceBase implements
Mapper<LongWritable, Text, Text, LongWritable> {
public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output,
Reporter reporter) throws IOException {
Text minf=new Text();
String line = value.toString();
Map data=RegexUtil.parseApache(line);
String createtime=(String)data.get("createtime");
String _minf=evaluate(createtime);
minf.set(_minf);
output.collect(minf, new LongWritable(1));
}
}
public static class ExtendLogReducer extends MapReduceBase implements
Reducer<Text, LongWritable, Text, LongWritable> {
public void reduce(Text key, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
long sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new LongWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration cf = new Configuration();
String[] otherArgs = new GenericOptionsParser(cf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: TestMinfMR <in> <out>");
System.exit(1);
}
JobConf conf = new JobConf(TestMinf.class);
conf.setJobName("testmr" );
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
conf.setMapperClass(ExtendLogMapper.class);
conf.setCombinerClass(ExtendLogReducer.class);
conf.setReducerClass(ExtendLogReducer.class);
// conf.setPartitionerClass(MyPartitioner.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.setNumReduceTasks(1);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
Using this Java MapReduce code, we could not find any wrong minf values with the same data.
The source data is from the 16:00 hour of 2012-08-09, and its size is 24,739,162,624 bytes.
We think this problem may be caused by the Hive UDF mechanism. Please give me some suggestions.
Thanks.