You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/08/03 12:36:11 UTC

svn commit: r1368873 - /opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java

Author: joern
Date: Fri Aug  3 10:36:11 2012
New Revision: 1368873

URL: http://svn.apache.org/viewvc?rev=1368873&view=rev
Log:
OPENNLP-531 Output directory must now be passed in as an argument.

Modified:
    opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java

Modified: opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java?rev=1368873&r1=1368872&r2=1368873&view=diff
==============================================================================
--- opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java (original)
+++ opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java Fri Aug  3 10:36:11 2012
@@ -17,6 +17,7 @@
 
 package org.apache.opennlp.wikinews_importer;
 
+import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -39,25 +40,21 @@ import org.apache.uima.resource.metadata
 import org.xml.sax.SAXException;
 
 /**
- * Demo application which reads a compressed or uncompressed Wikipedia XML dump
- * file (depending on the given file extension <i>.gz</i>, <i>.bz2</i> or
- * <i>.xml</i>) and prints the title and wiki text.
- * 
+ * Demo application which reads an uncompressed Wikipedia XML dump
+ * file and writes each article as an XMI file.
  */
 public class WikinewsConverter {
 
-  /**
-   * Print title an content of all the wiki pages in the dump.
-   * 
-   */
   static class CASArticleFilter implements IArticleFilter {
 
     private final TypeSystemDescription tsDesc;
+    private final File outputFolder;
     private List<String> endOfArtilceMarkers = new ArrayList<String>();
     
-    CASArticleFilter(TypeSystemDescription tsDesc) {
+    CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) {
       
       this.tsDesc = tsDesc;
+      this.outputFolder = outputFolder;
       
       endOfArtilceMarkers.add("{{haveyoursay}}");
       endOfArtilceMarkers.add("== Sources ==");
@@ -169,7 +166,8 @@ public class WikinewsConverter {
           // now serialize CAS
           OutputStream casOut = null;
           try {
-              casOut = new FileOutputStream("articles/" + titleToUri(page.getTitle()) + ".xmi");
+              casOut = new FileOutputStream(outputFolder.getAbsolutePath() +
+            		  File.separator + titleToUri(page.getTitle()) + ".xmi");
               
               UimaUtil.serializeCASToXmi(articleCAS, casOut);
           }
@@ -193,21 +191,26 @@ public class WikinewsConverter {
    * @param args
    */
   public static void main(String[] args) throws Exception {
-    if (args.length != 1) {
-      System.err.println("Usage: Parser <XML-FILE>"); 
-      // TODO: add folder where file are written here
+    if (args.length != 2) {
+      System.err.println("Usage: Parser <XML-File> <Output-Folder>"); 
       System.exit(-1);
     }
     
+    // TODO: The type system descriptor path should be configurable!
     TypeSystemDescription tsDesc = UimaUtil.createTypeSystemDescription(
         new FileInputStream("samples/TypeSystem.xml"));
 
+    File outputFolder = new File(args[1]);
+    outputFolder.mkdirs();
+    
     String bz2Filename = args[0];
     try {
-      IArticleFilter handler = new CASArticleFilter(tsDesc);
+      IArticleFilter handler = new CASArticleFilter(tsDesc, new File(args[1]));
       WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler);
       wxp.parse();
     } catch (Exception e) {
+      System.out.println("Parsing the corpus failed:");
+      System.out.println();
       e.printStackTrace();
     }
   }