You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2015/03/24 13:43:14 UTC

svn commit: r1668864 - in /uima/ruta/trunk/ruta-core/src: main/java/org/apache/uima/ruta/resource/ test/java/org/apache/uima/ruta/action/ test/resources/org/apache/uima/ruta/action/

Author: pkluegl
Date: Tue Mar 24 12:43:14 2015
New Revision: 1668864

URL: http://svn.apache.org/r1668864
Log:
UIMA-4277
- applied patch
- added TrieTest_compressed.ruta
- added missing trie_compressed.mtwl

Added:
    uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta
    uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl   (with props)
Modified:
    uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java
    uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
    uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java

Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java?rev=1668864&r1=1668863&r2=1668864&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java Tue Mar 24 12:43:14 2015
@@ -19,6 +19,7 @@
 
 package org.apache.uima.ruta.resource;
 
+import java.io.BufferedInputStream;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -26,6 +27,8 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream;
 
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
@@ -49,9 +52,40 @@ public class MultiTreeWordListPersistenc
     readMTWL(root, new FileInputStream(path), "UTF-8");
   }
 
+  /**
+   * Checks whether a stream starts with the XML declaration prefix by peeking at its first five
+   * bytes. The stream is marked before the read and reset afterwards.
+   * @param is the stream to sniff; {@link InputStream#markSupported()} must return true
+   * @return true if this stream starts with '<?xml'
+   * @throws IOException if the stream is null or cannot be marked, or if reading/resetting fails
+   */
+  public static boolean isSniffedXmlContentType(InputStream is)
+          throws IOException {
+      if (is == null)
+          throw new IOException("Stream is null");
+      if (!is.markSupported()){
+          throw new IOException("Cannot mark stream. just wrap it in a BufferedInputStream");
+      }
+      byte[] bytes = new byte[5]; // buffer for the first five bytes ("<?xml" is five characters)
+      is.mark(5); // remember the current position so reset() can rewind to it
+      is.read(bytes); // NOTE(review): return value ignored; read may fill fewer than five bytes
+      String prefix = new String(bytes); // NOTE(review): decoded with the platform default charset
+      is.reset(); // rewind to the marked position
+      if ("<?xml".equals(prefix)){
+          return true;
+      }
+      return false;
+  }
+
   public void readMTWL(MultiTextNode root, InputStream stream, String encoding) throws IOException {
     try {
-      InputStreamReader streamReader = new InputStreamReader(stream, encoding);
+      InputStream is = new BufferedInputStream(stream); // adds mark/reset support
+      boolean isXml = isSniffedXmlContentType(is);
+      if (!isXml){ // MTWL is encoded
+          is = new ZipInputStream(is);
+          ((ZipInputStream)is).getNextEntry(); // zip must contain a single entry
+      }
+      InputStreamReader streamReader = new InputStreamReader(is, encoding);
       TrieXMLEventHandler handler = new TrieXMLEventHandler(root);
       SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
       SAXParser saxParser = saxParserFactory.newSAXParser();
@@ -75,7 +109,8 @@ public class MultiTreeWordListPersistenc
   public void createMTWLFile(MultiTextNode root, String path, String encoding) {
     try {
       FileOutputStream output = new FileOutputStream(path);
-      OutputStreamWriter writer = new OutputStreamWriter(output, encoding);
+      ZipOutputStream zoutput = new ZipOutputStream(output);
+      OutputStreamWriter writer = new OutputStreamWriter(zoutput, encoding);
       writer.write("<?xml version=\"1.0\" ?><root>");
       for (MultiTextNode node : root.getChildren().values()) {
         writeTextNode(writer, node);

Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java?rev=1668864&r1=1668863&r2=1668864&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java Tue Mar 24 12:43:14 2015
@@ -19,6 +19,7 @@
 
 package org.apache.uima.ruta.resource;
 
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
@@ -31,6 +32,8 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream;
 
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
@@ -361,7 +364,13 @@ public class TreeWordList implements Rut
 
   public void readXML(InputStream stream, String encoding) throws IOException {
     try {
-      InputStreamReader streamReader = new InputStreamReader(stream, encoding);
+      InputStream is = new BufferedInputStream(stream); // adds mark/reset support
+      boolean isXml = MultiTreeWordListPersistence.isSniffedXmlContentType(is);
+      if (!isXml){ // MTWL is encoded
+        is = new ZipInputStream(is);
+        ((ZipInputStream)is).getNextEntry(); // zip must contain a single entry
+      }
+      InputStreamReader streamReader = new InputStreamReader(is, encoding);
       this.root = new TextNode();
       XMLEventHandler handler = new XMLEventHandler(root);
       SAXParserFactory factory = SAXParserFactory.newInstance();
@@ -389,7 +398,8 @@ public class TreeWordList implements Rut
   public void createXMLFile(String path, String encoding) {
     try {
       FileOutputStream output = new FileOutputStream(path);
-      OutputStreamWriter writer = new OutputStreamWriter(output, encoding);
+      ZipOutputStream zoutput = new ZipOutputStream(output);
+      OutputStreamWriter writer = new OutputStreamWriter(zoutput, encoding);
       writer.write("<?xml version=\"1.0\" ?>");
       writer.write("<root>");
       for (TextNode child : root.getChildren().values()) {

Modified: uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java?rev=1668864&r1=1668863&r2=1668864&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java (original)
+++ uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java Tue Mar 24 12:43:14 2015
@@ -45,54 +45,58 @@ public class TrieTest {
     String name = this.getClass().getSimpleName();
     String namespace = this.getClass().getPackage().getName().replaceAll("\\.", "/");
 
-    CAS cas = null;
-    try {
-      cas = RutaTestUtils.process(namespace + "/" + name + RutaEngine.SCRIPT_FILE_EXTENSION, namespace + "/" + name
-              + ".txt", 50, false, false, null, namespace + "/");
-    } catch (Exception e) {
-      e.printStackTrace();
-      assert (false);
-    }
-    Type t = null;
-    AnnotationIndex<AnnotationFS> ai = null;
-    FSIterator<AnnotationFS> iterator = null;
+    for (String scriptname : new String[] { name, name + "_compressed" }) {
 
-    t = RutaTestUtils.getTestType(cas, 1);
-    ai = cas.getAnnotationIndex(t);
-    assertEquals(3, ai.size());
-    iterator = ai.iterator();
-    assertEquals("Peter", iterator.next().getCoveredText());
-    assertEquals("Marshall", iterator.next().getCoveredText());
-    assertEquals("Joern", iterator.next().getCoveredText());
+      CAS cas = null;
+      try {
+        cas = RutaTestUtils.process(
+                namespace + "/" + scriptname + RutaEngine.SCRIPT_FILE_EXTENSION, namespace + "/"
+                        + name + ".txt", 50, false, false, null, namespace + "/");
+      } catch (Exception e) {
+        e.printStackTrace();
+        assert (false);
+      }
+      Type t = null;
+      AnnotationIndex<AnnotationFS> ai = null;
+      FSIterator<AnnotationFS> iterator = null;
 
-    t = RutaTestUtils.getTestType(cas, 2);
-    ai = cas.getAnnotationIndex(t);
-    assertEquals(3, ai.size());
-    iterator = ai.iterator();
-    assertEquals("Kluegl", iterator.next().getCoveredText());
-    assertEquals("Schor", iterator.next().getCoveredText());
-    assertEquals("Kottmann", iterator.next().getCoveredText());
-    
-    t = RutaTestUtils.getTestType(cas, 3);
-    ai = cas.getAnnotationIndex(t);
-    assertEquals(3, ai.size());
-    iterator = ai.iterator();
-    assertEquals("Peter Kluegl", iterator.next().getCoveredText());
-    assertEquals("Marshall Schor", iterator.next().getCoveredText());
-    assertEquals("Joern Kottmann", iterator.next().getCoveredText());
+      t = RutaTestUtils.getTestType(cas, 1);
+      ai = cas.getAnnotationIndex(t);
+      assertEquals(3, ai.size());
+      iterator = ai.iterator();
+      assertEquals("Peter", iterator.next().getCoveredText());
+      assertEquals("Marshall", iterator.next().getCoveredText());
+      assertEquals("Joern", iterator.next().getCoveredText());
 
-    t = RutaTestUtils.getTestType(cas, 4);
-    ai = cas.getAnnotationIndex(t);
-    assertEquals(3, ai.size());
-    iterator = ai.iterator();
-    assertEquals("Peter Kluegl: Ruta", iterator.next().getCoveredText());
-    assertEquals("Marshall Schor: UIMA", iterator.next().getCoveredText());
-    assertEquals("Joern Kottmann: CAS Editor", iterator.next().getCoveredText());
-    
-    cas.release();
-    
+      t = RutaTestUtils.getTestType(cas, 2);
+      ai = cas.getAnnotationIndex(t);
+      assertEquals(3, ai.size());
+      iterator = ai.iterator();
+      assertEquals("Kluegl", iterator.next().getCoveredText());
+      assertEquals("Schor", iterator.next().getCoveredText());
+      assertEquals("Kottmann", iterator.next().getCoveredText());
+
+      t = RutaTestUtils.getTestType(cas, 3);
+      ai = cas.getAnnotationIndex(t);
+      assertEquals(3, ai.size());
+      iterator = ai.iterator();
+      assertEquals("Peter Kluegl", iterator.next().getCoveredText());
+      assertEquals("Marshall Schor", iterator.next().getCoveredText());
+      assertEquals("Joern Kottmann", iterator.next().getCoveredText());
+
+      t = RutaTestUtils.getTestType(cas, 4);
+      ai = cas.getAnnotationIndex(t);
+      assertEquals(3, ai.size());
+      iterator = ai.iterator();
+      assertEquals("Peter Kluegl: Ruta", iterator.next().getCoveredText());
+      assertEquals("Marshall Schor: UIMA", iterator.next().getCoveredText());
+      assertEquals("Joern Kottmann: CAS Editor", iterator.next().getCoveredText());
+
+      cas.release();
+
+    }
   }
-  
+
   @Test
   public void testWithFeature() {
     String name = this.getClass().getSimpleName() + "WithFeature";
@@ -118,11 +122,11 @@ public class TrieTest {
     features.put(typeNameC, listC);
     String fnci = "c";
     listC.add(new TestFeature(fnci, "", "uima.cas.Integer"));
-   
 
     try {
       cas = RutaTestUtils.process(namespace + "/" + name + RutaEngine.SCRIPT_FILE_EXTENSION,
-              namespace + "/" + this.getClass().getSimpleName() + ".txt", 50, false, false, complexTypes, features, namespace + "/");
+              namespace + "/" + this.getClass().getSimpleName() + ".txt", 50, false, false,
+              complexTypes, features, namespace + "/");
     } catch (Exception e) {
       e.printStackTrace();
       assert (false);
@@ -132,7 +136,7 @@ public class TrieTest {
     FSIterator<AnnotationFS> iterator = null;
     AnnotationFS next = null;
     Feature feature = null;
-    
+
     t = cas.getTypeSystem().getType(typeNameA);
     feature = t.getFeatureByBaseName("a");
     ai = cas.getAnnotationIndex(t);
@@ -147,7 +151,7 @@ public class TrieTest {
     next = iterator.next();
     assertEquals("Joern", next.getCoveredText());
     assertEquals("first", next.getStringValue(feature));
-    
+
     t = cas.getTypeSystem().getType(typeNameB);
     feature = t.getFeatureByBaseName("b");
     ai = cas.getAnnotationIndex(t);
@@ -162,7 +166,7 @@ public class TrieTest {
     next = iterator.next();
     assertEquals("Kottmann", next.getCoveredText());
     assertEquals(true, next.getBooleanValue(feature));
-    
+
     t = cas.getTypeSystem().getType(typeNameC);
     feature = t.getFeatureByBaseName("c");
     ai = cas.getAnnotationIndex(t);

Added: uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta?rev=1668864&view=auto
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta (added)
+++ uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta Tue Mar 24 12:43:14 2015
@@ -0,0 +1,9 @@
+PACKAGE org.apache.uima;
+
+WORDLIST list1 = 'trie_compressed.mtwl'; // presumably the zip-compressed word list added in this commit - verify
+
+DECLARE T1, T2, T3, T4, T5; // T5 is declared but not referenced in this script
+
+Document{->TRIE("FirstNames.txt" = T1, "LastNames.txt" = T2,
+    "CompleteNames.txt" = T3, "NamesWithSystems.txt" = T4,
+	list1, true, 4, false, 0, ":")}; // maps each named list in list1 to one of the types T1-T4

Added: uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl?rev=1668864&view=auto
==============================================================================
Binary file - no diff available.

Propchange: uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream