You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/01/24 16:31:14 UTC
svn commit: r1062829 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
cmdline/CLI.java cmdline/doccat/DoccatConverterTool.java
formats/LeipzigDocumentSampleStreamFactory.java
Author: joern
Date: Mon Jan 24 15:31:14 2011
New Revision: 1062829
URL: http://svn.apache.org/viewvc?rev=1062829&view=rev
Log:
OPENNLP-79 Added CLI support to convert the Leipzig data to doccat training data
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java (with props)
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java?rev=1062829&r1=1062828&r2=1062829&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java Mon Jan 24 15:31:14 2011
@@ -30,6 +30,9 @@ import opennlp.tools.cmdline.chunker.Chu
import opennlp.tools.cmdline.chunker.ChunkerEvaluatorTool;
import opennlp.tools.cmdline.chunker.ChunkerMETool;
import opennlp.tools.cmdline.chunker.ChunkerTrainerTool;
+import opennlp.tools.cmdline.doccat.DoccatConverterTool;
+import opennlp.tools.cmdline.doccat.DoccatTool;
+import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
import opennlp.tools.cmdline.namefind.CensusDictionaryCreatorTool;
import opennlp.tools.cmdline.namefind.TokenNameFinderConverterTool;
import opennlp.tools.cmdline.namefind.TokenNameFinderEvaluatorTool;
@@ -67,6 +70,11 @@ public final class CLI {
List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+ // Document Categorizer
+ tools.add(new DoccatTool());
+ tools.add(new DoccatTrainerTool());
+ tools.add(new DoccatConverterTool());
+
// Tokenizer
tools.add(new SimpleTokenizerTool());
tools.add(new TokenizerMETool());
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java?rev=1062829&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java Mon Jan 24 15:31:14 2011
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.doccat;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
+
+/**
+ * Converter tool for {@link DocumentSample} data.
+ * <p>
+ * Stream factories are looked up by format name. The only format
+ * registered in this class is <code>leipzig</code>, which is handled
+ * by the {@link LeipzigDocumentSampleStreamFactory}.
+ */
+public class DoccatConverterTool extends AbstractConverterTool<DocumentSample> {
+
+  /**
+   * Read-only mapping from a format name to the factory which creates
+   * the {@link DocumentSample} stream for that format.
+   */
+  private static final Map<String, ObjectStreamFactory<DocumentSample>> streamFactories;
+
+  static {
+    Map<String, ObjectStreamFactory<DocumentSample>> mutableStreamFactories =
+        new HashMap<String, ObjectStreamFactory<DocumentSample>>();
+
+    mutableStreamFactories.put("leipzig", new LeipzigDocumentSampleStreamFactory());
+
+    // Wrapped so the registry cannot be modified through this reference
+    // once the class has been initialized.
+    streamFactories = Collections.unmodifiableMap(mutableStreamFactories);
+  }
+
+  /**
+   * Retrieves the name of this tool.
+   *
+   * @return the constant <code>DoccatConverter</code>
+   */
+  public String getName() {
+    return "DoccatConverter";
+  }
+
+  /**
+   * Retrieves a one line summary of what this tool does.
+   *
+   * @return a non-empty, human readable description
+   */
+  public String getShortDescription() {
+    // An empty description leaves the tool unexplained wherever the
+    // description is displayed, so a real summary is returned.
+    return "converts leipzig data format to native OpenNLP format";
+  }
+
+  /**
+   * Looks up the stream factory registered for the given format name.
+   *
+   * @param format the name of the data format, e.g. <code>leipzig</code>
+   *
+   * @return the factory registered under that name, or <code>null</code>
+   *     if no factory is registered for it
+   */
+  @Override
+  protected ObjectStreamFactory<DocumentSample> createStreamFactory(String format) {
+    return streamFactories.get(format);
+  }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatConverterTool.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java?rev=1062829&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java Mon Jan 24 15:31:14 2011
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Stream factory which builds a {@link DocumentSample} stream from the
+ * language and the sample data file named in the command line arguments.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigDocumentSampleStreamFactory implements ObjectStreamFactory<DocumentSample> {
+
+  /**
+   * Describes the command line arguments which are read by this factory.
+   */
+  interface Parameters {
+
+    /** The language argument; the accepted values are listed in the value name. */
+    @ParameterDescription(valueName = "cat|de|dk|ee|en|fi|fr|it|jp|kr|nl|no|se|sorb|tr")
+    String getLang();
+
+    /** The argument which names the sample data file. */
+    @ParameterDescription(valueName = "sampleData")
+    String getData();
+  }
+
+  /**
+   * Creates the usage text from the {@link Parameters} description.
+   *
+   * @return the usage string produced by the {@link ArgumentParser}
+   */
+  public String getUsage() {
+    return ArgumentParser.createUsage(Parameters.class);
+  }
+
+  /**
+   * Checks the arguments against the {@link Parameters} description.
+   *
+   * @param args the command line arguments to check
+   *
+   * @return the validation result reported by the {@link ArgumentParser}
+   */
+  public boolean validateArguments(String[] args) {
+    return ArgumentParser.validateArguments(args, Parameters.class);
+  }
+
+  /**
+   * Parses the arguments and opens a sample stream on the named data file.
+   *
+   * @param args the command line arguments, see {@link Parameters}
+   *
+   * @return the newly created sample stream
+   *
+   * @throws TerminateToolException with code -1 if an {@link IOException}
+   *     occurs while the stream is created; the reason is printed to
+   *     <code>System.err</code> first
+   */
+  public ObjectStream<DocumentSample> create(String[] args) {
+
+    final Parameters arguments = ArgumentParser.parse(args, Parameters.class);
+
+    final String language = arguments.getLang();
+    final File sampleDataFile = new File(arguments.getData());
+
+    try {
+      // NOTE(review): the meaning of the constant 20 is defined by the
+      // LeipzigDoccatSampleStream constructor, presumably the number of
+      // sentences per document - confirm there.
+      return new LeipzigDoccatSampleStream(language, 20,
+          CmdLineUtil.openInFile(sampleDataFile));
+    } catch (IOException e) {
+      System.err.println("Cannot open sample data: " + e.getMessage());
+      throw new TerminateToolException(-1);
+    }
+  }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain