You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/02 10:45:10 UTC

nutch git commit: NUTCH-1308 Add main() to ZipParser

Repository: nutch
Updated Branches:
  refs/heads/master ecf2bb011 -> 5943d11ad


NUTCH-1308 Add main() to ZipParser


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/5943d11a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/5943d11a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/5943d11a

Branch: refs/heads/master
Commit: 5943d11ad976f51ab0e861f8ed128ace950c246d
Parents: ecf2bb0
Author: Sebastian Nagel <sn...@apache.org>
Authored: Sat Jul 2 12:44:39 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:44:39 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/parse/zip/ZipParser.java   | 31 ++++++++++++++++++++
 1 file changed, 31 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/5943d11a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
index 5d0c2f7..f441fd0 100644
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
+++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -18,20 +18,26 @@
 package org.apache.nutch.parse.zip;
 
 import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.conf.Configuration;
 
 /**
@@ -110,4 +116,29 @@ public class ZipParser implements Parser {
     return this.conf;
   }
 
+  /** Parses the zip file named by the first argument and prints the result. */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 1) {
+      System.out.println("ZipParser <zip_file>");
+      System.exit(1);
+    }
+    File file = new File(args[0]);
+    String url = "file:"+file.getCanonicalPath();
+    // Read the file completely: available() plus a single read() may return
+    // fewer bytes than the file holds.
+    byte[] bytes = java.nio.file.Files.readAllBytes(file.toPath());
+    Configuration conf = NutchConfiguration.create();
+    ZipParser parser = new ZipParser();
+    parser.setConf(conf);
+    Metadata meta = new Metadata();
+    meta.add(Response.CONTENT_LENGTH, ""+file.length());
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/zip", meta, conf));
+    Parse p = parseResult.get(url);
+    System.out.println(parseResult.size());
+    System.out.println("Parse Text:");
+    System.out.println(p.getText());
+    System.out.println("Parse Data:");
+    System.out.println(p.getData());
+  }
 }