You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/01/24 16:31:14 UTC

svn commit: r1062829 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/CLI.java cmdline/doccat/DoccatConverterTool.java formats/LeipzigDocumentSampleStreamFactory.java

Author: joern
Date: Mon Jan 24 15:31:14 2011
New Revision: 1062829

URL: http://svn.apache.org/viewvc?rev=1062829&view=rev
Log:
OPENNLP-79 Added CLI support to convert the Leipzig data to doccat training data

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java   (with props)
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java?rev=1062829&r1=1062828&r2=1062829&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java Mon Jan 24 15:31:14 2011
@@ -30,6 +30,9 @@ import opennlp.tools.cmdline.chunker.Chu
 import opennlp.tools.cmdline.chunker.ChunkerEvaluatorTool;
 import opennlp.tools.cmdline.chunker.ChunkerMETool;
 import opennlp.tools.cmdline.chunker.ChunkerTrainerTool;
+import opennlp.tools.cmdline.doccat.DoccatConverterTool;
+import opennlp.tools.cmdline.doccat.DoccatTool;
+import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
 import opennlp.tools.cmdline.namefind.CensusDictionaryCreatorTool;
 import opennlp.tools.cmdline.namefind.TokenNameFinderConverterTool;
 import opennlp.tools.cmdline.namefind.TokenNameFinderEvaluatorTool;
@@ -67,6 +70,11 @@ public final class CLI {
     
     List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
     
+    // Document Categorizer
+    tools.add(new DoccatTool());
+    tools.add(new DoccatTrainerTool());
+    tools.add(new DoccatConverterTool());
+    
     // Tokenizer
     tools.add(new SimpleTokenizerTool());
     tools.add(new TokenizerMETool());

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java?rev=1062829&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java Mon Jan 24 15:31:14 2011
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.doccat;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
+
+public class DoccatConverterTool extends AbstractConverterTool<DocumentSample> {
+
+  private static final Map<String, ObjectStreamFactory<DocumentSample>> streamFactories;
+  
+  static {
+    Map<String, ObjectStreamFactory<DocumentSample>> mutableStreamFactories =
+      new HashMap<String, ObjectStreamFactory<DocumentSample>>();
+    
+    mutableStreamFactories.put("leipzig", new LeipzigDocumentSampleStreamFactory());
+    
+    streamFactories = Collections.unmodifiableMap(mutableStreamFactories);
+  }
+  
+  public String getName() {
+    return "DoccatConverter";
+  }
+
+  public String getShortDescription() {
+    return "";
+  }
+
+  @Override
+  protected ObjectStreamFactory<DocumentSample> createStreamFactory(String format) {
+    return streamFactories.get(format);
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java?rev=1062829&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java Mon Jan 24 15:31:14 2011
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigDocumentSampleStreamFactory implements ObjectStreamFactory<DocumentSample> {
+
+  interface Parameters {
+    @ParameterDescription(valueName = "cat|de|dk|ee|en|fi|fr|it|jp|kr|nl|no|se|sorb|tr")
+    String getLang();
+    
+    @ParameterDescription(valueName = "sampleData")
+    String getData();
+  }
+  
+  public String getUsage() {
+    return ArgumentParser.createUsage(Parameters.class);
+  }
+  
+  public boolean validateArguments(String[] args) {
+    return ArgumentParser.validateArguments(args, Parameters.class);
+  }
+  
+  public ObjectStream<DocumentSample> create(String[] args) {
+    
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    try {
+      return new LeipzigDoccatSampleStream(params.getLang(), 20,
+          CmdLineUtil.openInFile(new File(params.getData())));
+    } catch (IOException e) {
+      System.err.println("Cannot open sample data: " + e.getMessage());
+      throw new TerminateToolException(-1);
+    }
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain