You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/09/30 13:10:29 UTC

svn commit: r1177597 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats: LeipzigDoccatSampleStream.java LeipzigDocumentSampleStreamFactory.java

Author: joern
Date: Fri Sep 30 11:10:28 2011
New Revision: 1177597

URL: http://svn.apache.org/viewvc?rev=1177597&view=rev
Log:
OPENNLP-305 Replaced the per-language encoding lookup with a fixed UTF-8 encoding and removed the restriction to specific language codes.

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java?rev=1177597&r1=1177596&r2=1177597&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java Fri Sep 30 11:10:28 2011
@@ -52,51 +52,11 @@ public class LeipzigDoccatSampleStream e
    */
   LeipzigDoccatSampleStream(String language, int sentencesPerDocument, 
       InputStream in) throws IOException {
-    super(new PlainTextByLineStream(in, mapLanguageToEncoding(language)));
+    super(new PlainTextByLineStream(in, "UTF-8"));
     this.language = language;
     this.sentencesPerDocument = sentencesPerDocument;
   }
   
-  /**
-   * Maps the language to the file encoding, if the encoding
-   * cannot be specified an IOException is thrown.
-   * 
-   * @return
-   * @throws IOException
-   */
-  private static String mapLanguageToEncoding(String language) throws IOException {
-    
-    if (language == null)
-      throw new NullPointerException("language parameter must not be null!");
-    
-    
-    Map<String, String> encodingMap = new HashMap<String, String>();
-    encodingMap.put("cat", "ISO-8859-1");
-    encodingMap.put("de", "ISO-8859-1");
-    encodingMap.put("dk", "ISO-8859-1");
-    encodingMap.put("ee", "ISO-8859-4");
-    encodingMap.put("en", "ISO-8859-1");
-    encodingMap.put("fi", "ISO-8859-1");
-    encodingMap.put("fr", "ISO-8859-1");
-    encodingMap.put("it", "ISO-8859-1");
-    encodingMap.put("jp", "UTF-8");
-    encodingMap.put("kr", "UTF-8");
-    encodingMap.put("nl", "ISO-8859-1");
-    encodingMap.put("no", "ISO-8859-1");
-    encodingMap.put("se", "ISO-8859-1");
-    encodingMap.put("sorb", "ISO-8859-2");
-    encodingMap.put("tr", "ISO-8859-9");
-    
-    String encoding = encodingMap.get(language);
-    
-    if (encoding != null) {
-      return encoding;
-    }
-    else {
-      throw new IOException("Encoding for language " + language + " is not specified!");
-    }
-  }
-  
   public DocumentSample read() throws IOException {
 
     int count = 0;

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java?rev=1177597&r1=1177596&r2=1177597&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java Fri Sep 30 11:10:28 2011
@@ -34,7 +34,7 @@ import opennlp.tools.util.ObjectStream;
 public class LeipzigDocumentSampleStreamFactory implements ObjectStreamFactory<DocumentSample> {
 
   interface Parameters {
-    @ParameterDescription(valueName = "cat|de|dk|ee|en|fi|fr|it|jp|kr|nl|no|se|sorb|tr")
+    @ParameterDescription(valueName = "languageCode")
     String getLang();
     
     @ParameterDescription(valueName = "sampleData")