You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/02 10:45:10 UTC
nutch git commit: NUTCH-1308 Add main() to ZipParser
Repository: nutch
Updated Branches:
refs/heads/master ecf2bb011 -> 5943d11ad
NUTCH-1308 Add main() to ZipParser
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/5943d11a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/5943d11a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/5943d11a
Branch: refs/heads/master
Commit: 5943d11ad976f51ab0e861f8ed128ace950c246d
Parents: ecf2bb0
Author: Sebastian Nagel <sn...@apache.org>
Authored: Sat Jul 2 12:44:39 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:44:39 2016 +0200
----------------------------------------------------------------------
.../org/apache/nutch/parse/zip/ZipParser.java | 31 ++++++++++++++++++++
1 file changed, 31 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/5943d11a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
index 5d0c2f7..f441fd0 100644
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
+++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -18,20 +18,26 @@
package org.apache.nutch.parse.zip;
import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
/**
@@ -110,4 +116,29 @@ public class ZipParser implements Parser {
return this.conf;
}
+ /**
+  * Command-line entry point: reads a single local zip file, parses it with
+  * {@link ZipParser} and prints the number of parses, the parse text and the
+  * parse data to standard output.
+  *
+  * @param args
+  *          command-line arguments; {@code args[0]} is the path of the zip
+  *          file to parse
+  * @throws IOException
+  *           if the file is too large to buffer or cannot be read completely
+  */
+ public static void main(String[] args) throws IOException {
+   if (args.length < 1) {
+     System.out.println("ZipParser <zip_file>");
+     System.exit(1);
+   }
+   File file = new File(args[0]);
+   String url = "file:" + file.getCanonicalPath();
+   long length = file.length();
+   if (length > Integer.MAX_VALUE) {
+     throw new IOException("File too large to load into memory: " + file);
+   }
+   // Size the buffer from the file length and loop until it is full:
+   // available() is only an estimate and a single read() may return fewer
+   // bytes than requested, which would hand the parser truncated content.
+   byte[] bytes = new byte[(int) length];
+   FileInputStream in = new FileInputStream(file);
+   try {
+     int offset = 0;
+     while (offset < bytes.length) {
+       int n = in.read(bytes, offset, bytes.length - offset);
+       if (n < 0) {
+         throw new IOException("Unexpected end of file: " + file);
+       }
+       offset += n;
+     }
+   } finally {
+     // close the stream even when reading fails
+     in.close();
+   }
+   Configuration conf = NutchConfiguration.create();
+   ZipParser parser = new ZipParser();
+   parser.setConf(conf);
+   Metadata meta = new Metadata();
+   // the recorded content length is the file size, which now always equals
+   // the number of bytes actually passed to the parser
+   meta.add(Response.CONTENT_LENGTH, "" + file.length());
+   ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+       "application/zip", meta, conf));
+   Parse p = parseResult.get(url);
+   System.out.println(parseResult.size());
+   if (p == null) {
+     // guard against a result that holds no entry for this URL
+     System.err.println("No parse found for " + url);
+     System.exit(1);
+   }
+   System.out.println("Parse Text:");
+   System.out.println(p.getText());
+   System.out.println("Parse Data:");
+   System.out.println(p.getData());
+ }
}