You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2015/03/24 13:43:14 UTC
svn commit: r1668864 - in /uima/ruta/trunk/ruta-core/src:
main/java/org/apache/uima/ruta/resource/
test/java/org/apache/uima/ruta/action/
test/resources/org/apache/uima/ruta/action/
Author: pkluegl
Date: Tue Mar 24 12:43:14 2015
New Revision: 1668864
URL: http://svn.apache.org/r1668864
Log:
UIMA-4277
- applied patch
- added TrieTest_compressed.ruta
- added missing trie_compressed.mtwl
Added:
uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta
uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl (with props)
Modified:
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java?rev=1668864&r1=1668863&r2=1668864&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordListPersistence.java Tue Mar 24 12:43:14 2015
@@ -19,6 +19,7 @@
package org.apache.uima.ruta.resource;
+import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -26,6 +27,8 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
@@ -49,9 +52,40 @@ public class MultiTreeWordListPersistenc
readMTWL(root, new FileInputStream(path), "UTF-8");
}
+ /**
+  * Sniffs whether the stream holds plain XML by peeking at its first five bytes.
+  * The stream is reset afterwards, so the caller still sees it from its original position.
+  *
+  * @param is
+  *          the stream to sniff; it must support mark/reset, i.e.
+  *          {@link InputStream#markSupported()} must return true
+  * @return true if the stream starts with the ASCII bytes '<?xml', false otherwise
+  *         (including when the stream holds fewer than five bytes)
+  * @throws IOException
+  *           if the stream is null, does not support mark/reset, or cannot be read or reset
+  */
+ public static boolean isSniffedXmlContentType(InputStream is)
+     throws IOException {
+   if (is == null) {
+     throw new IOException("Stream is null");
+   }
+   if (!is.markSupported()) {
+     throw new IOException("Cannot mark stream. just wrap it in a BufferedInputStream");
+   }
+   // '<?xml' as raw bytes: comparing bytes keeps the check independent of the platform charset
+   final byte[] xmlPrefix = { '<', '?', 'x', 'm', 'l' };
+   byte[] bytes = new byte[xmlPrefix.length];
+   is.mark(bytes.length);
+   int filled = 0;
+   // a single read() may deliver fewer bytes than requested, so loop until full or end of stream
+   while (filled < bytes.length) {
+     int count = is.read(bytes, filled, bytes.length - filled);
+     if (count < 0) {
+       break;
+     }
+     filled += count;
+   }
+   is.reset();
+   return filled == bytes.length && java.util.Arrays.equals(bytes, xmlPrefix);
+ }
+
public void readMTWL(MultiTextNode root, InputStream stream, String encoding) throws IOException {
try {
- InputStreamReader streamReader = new InputStreamReader(stream, encoding);
+ InputStream is = new BufferedInputStream(stream); // adds mark/reset support
+ boolean isXml = isSniffedXmlContentType(is);
+ if (!isXml){ // MTWL is encoded
+ is = new ZipInputStream(is);
+ ((ZipInputStream)is).getNextEntry(); // zip must contain a single entry
+ }
+ InputStreamReader streamReader = new InputStreamReader(is, encoding);
TrieXMLEventHandler handler = new TrieXMLEventHandler(root);
SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
SAXParser saxParser = saxParserFactory.newSAXParser();
@@ -75,7 +109,8 @@ public class MultiTreeWordListPersistenc
public void createMTWLFile(MultiTextNode root, String path, String encoding) {
try {
FileOutputStream output = new FileOutputStream(path);
- OutputStreamWriter writer = new OutputStreamWriter(output, encoding);
+ ZipOutputStream zoutput = new ZipOutputStream(output);
+ OutputStreamWriter writer = new OutputStreamWriter(zoutput, encoding);
writer.write("<?xml version=\"1.0\" ?><root>");
for (MultiTextNode node : root.getChildren().values()) {
writeTextNode(writer, node);
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java?rev=1668864&r1=1668863&r2=1668864&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java Tue Mar 24 12:43:14 2015
@@ -19,6 +19,7 @@
package org.apache.uima.ruta.resource;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
@@ -31,6 +32,8 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
@@ -361,7 +364,13 @@ public class TreeWordList implements Rut
public void readXML(InputStream stream, String encoding) throws IOException {
try {
- InputStreamReader streamReader = new InputStreamReader(stream, encoding);
+ InputStream is = new BufferedInputStream(stream); // adds mark/reset support
+ boolean isXml = MultiTreeWordListPersistence.isSniffedXmlContentType(is);
+ if (!isXml){ // MTWL is encoded
+ is = new ZipInputStream(is);
+ ((ZipInputStream)is).getNextEntry(); // zip must contain a single entry
+ }
+ InputStreamReader streamReader = new InputStreamReader(is, encoding);
this.root = new TextNode();
XMLEventHandler handler = new XMLEventHandler(root);
SAXParserFactory factory = SAXParserFactory.newInstance();
@@ -389,7 +398,8 @@ public class TreeWordList implements Rut
public void createXMLFile(String path, String encoding) {
try {
FileOutputStream output = new FileOutputStream(path);
- OutputStreamWriter writer = new OutputStreamWriter(output, encoding);
+ ZipOutputStream zoutput = new ZipOutputStream(output);
+ OutputStreamWriter writer = new OutputStreamWriter(zoutput, encoding);
writer.write("<?xml version=\"1.0\" ?>");
writer.write("<root>");
for (TextNode child : root.getChildren().values()) {
Modified: uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java?rev=1668864&r1=1668863&r2=1668864&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java (original)
+++ uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/action/TrieTest.java Tue Mar 24 12:43:14 2015
@@ -45,54 +45,58 @@ public class TrieTest {
String name = this.getClass().getSimpleName();
String namespace = this.getClass().getPackage().getName().replaceAll("\\.", "/");
- CAS cas = null;
- try {
- cas = RutaTestUtils.process(namespace + "/" + name + RutaEngine.SCRIPT_FILE_EXTENSION, namespace + "/" + name
- + ".txt", 50, false, false, null, namespace + "/");
- } catch (Exception e) {
- e.printStackTrace();
- assert (false);
- }
- Type t = null;
- AnnotationIndex<AnnotationFS> ai = null;
- FSIterator<AnnotationFS> iterator = null;
+ for (String scriptname : new String[] { name, name + "_compressed" }) {
- t = RutaTestUtils.getTestType(cas, 1);
- ai = cas.getAnnotationIndex(t);
- assertEquals(3, ai.size());
- iterator = ai.iterator();
- assertEquals("Peter", iterator.next().getCoveredText());
- assertEquals("Marshall", iterator.next().getCoveredText());
- assertEquals("Joern", iterator.next().getCoveredText());
+ CAS cas = null;
+ try {
+ cas = RutaTestUtils.process(
+ namespace + "/" + scriptname + RutaEngine.SCRIPT_FILE_EXTENSION, namespace + "/"
+ + name + ".txt", 50, false, false, null, namespace + "/");
+ } catch (Exception e) {
+ e.printStackTrace();
+ assert (false);
+ }
+ Type t = null;
+ AnnotationIndex<AnnotationFS> ai = null;
+ FSIterator<AnnotationFS> iterator = null;
- t = RutaTestUtils.getTestType(cas, 2);
- ai = cas.getAnnotationIndex(t);
- assertEquals(3, ai.size());
- iterator = ai.iterator();
- assertEquals("Kluegl", iterator.next().getCoveredText());
- assertEquals("Schor", iterator.next().getCoveredText());
- assertEquals("Kottmann", iterator.next().getCoveredText());
-
- t = RutaTestUtils.getTestType(cas, 3);
- ai = cas.getAnnotationIndex(t);
- assertEquals(3, ai.size());
- iterator = ai.iterator();
- assertEquals("Peter Kluegl", iterator.next().getCoveredText());
- assertEquals("Marshall Schor", iterator.next().getCoveredText());
- assertEquals("Joern Kottmann", iterator.next().getCoveredText());
+ t = RutaTestUtils.getTestType(cas, 1);
+ ai = cas.getAnnotationIndex(t);
+ assertEquals(3, ai.size());
+ iterator = ai.iterator();
+ assertEquals("Peter", iterator.next().getCoveredText());
+ assertEquals("Marshall", iterator.next().getCoveredText());
+ assertEquals("Joern", iterator.next().getCoveredText());
- t = RutaTestUtils.getTestType(cas, 4);
- ai = cas.getAnnotationIndex(t);
- assertEquals(3, ai.size());
- iterator = ai.iterator();
- assertEquals("Peter Kluegl: Ruta", iterator.next().getCoveredText());
- assertEquals("Marshall Schor: UIMA", iterator.next().getCoveredText());
- assertEquals("Joern Kottmann: CAS Editor", iterator.next().getCoveredText());
-
- cas.release();
-
+ t = RutaTestUtils.getTestType(cas, 2);
+ ai = cas.getAnnotationIndex(t);
+ assertEquals(3, ai.size());
+ iterator = ai.iterator();
+ assertEquals("Kluegl", iterator.next().getCoveredText());
+ assertEquals("Schor", iterator.next().getCoveredText());
+ assertEquals("Kottmann", iterator.next().getCoveredText());
+
+ t = RutaTestUtils.getTestType(cas, 3);
+ ai = cas.getAnnotationIndex(t);
+ assertEquals(3, ai.size());
+ iterator = ai.iterator();
+ assertEquals("Peter Kluegl", iterator.next().getCoveredText());
+ assertEquals("Marshall Schor", iterator.next().getCoveredText());
+ assertEquals("Joern Kottmann", iterator.next().getCoveredText());
+
+ t = RutaTestUtils.getTestType(cas, 4);
+ ai = cas.getAnnotationIndex(t);
+ assertEquals(3, ai.size());
+ iterator = ai.iterator();
+ assertEquals("Peter Kluegl: Ruta", iterator.next().getCoveredText());
+ assertEquals("Marshall Schor: UIMA", iterator.next().getCoveredText());
+ assertEquals("Joern Kottmann: CAS Editor", iterator.next().getCoveredText());
+
+ cas.release();
+
+ }
}
-
+
@Test
public void testWithFeature() {
String name = this.getClass().getSimpleName() + "WithFeature";
@@ -118,11 +122,11 @@ public class TrieTest {
features.put(typeNameC, listC);
String fnci = "c";
listC.add(new TestFeature(fnci, "", "uima.cas.Integer"));
-
try {
cas = RutaTestUtils.process(namespace + "/" + name + RutaEngine.SCRIPT_FILE_EXTENSION,
- namespace + "/" + this.getClass().getSimpleName() + ".txt", 50, false, false, complexTypes, features, namespace + "/");
+ namespace + "/" + this.getClass().getSimpleName() + ".txt", 50, false, false,
+ complexTypes, features, namespace + "/");
} catch (Exception e) {
e.printStackTrace();
assert (false);
@@ -132,7 +136,7 @@ public class TrieTest {
FSIterator<AnnotationFS> iterator = null;
AnnotationFS next = null;
Feature feature = null;
-
+
t = cas.getTypeSystem().getType(typeNameA);
feature = t.getFeatureByBaseName("a");
ai = cas.getAnnotationIndex(t);
@@ -147,7 +151,7 @@ public class TrieTest {
next = iterator.next();
assertEquals("Joern", next.getCoveredText());
assertEquals("first", next.getStringValue(feature));
-
+
t = cas.getTypeSystem().getType(typeNameB);
feature = t.getFeatureByBaseName("b");
ai = cas.getAnnotationIndex(t);
@@ -162,7 +166,7 @@ public class TrieTest {
next = iterator.next();
assertEquals("Kottmann", next.getCoveredText());
assertEquals(true, next.getBooleanValue(feature));
-
+
t = cas.getTypeSystem().getType(typeNameC);
feature = t.getFeatureByBaseName("c");
ai = cas.getAnnotationIndex(t);
Added: uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta?rev=1668864&view=auto
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta (added)
+++ uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/TrieTest_compressed.ruta Tue Mar 24 12:43:14 2015
@@ -0,0 +1,9 @@
+PACKAGE org.apache.uima;
+
+WORDLIST list1 = 'trie_compressed.mtwl'; // binary word list added alongside this script; presumably the zip-compressed trie -- confirm
+
+DECLARE T1, T2, T3, T4, T5; // T5 is declared but not referenced by the rule below
+
+Document{->TRIE("FirstNames.txt" = T1, "LastNames.txt" = T2,
+ "CompleteNames.txt" = T3, "NamesWithSystems.txt" = T4,
+ list1, true, 4, false, 0, ":")}; // maps the four list names to T1..T4 via list1; the trailing arguments are TRIE matching options (see the Ruta TRIE action docs)
Added: uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl?rev=1668864&view=auto
==============================================================================
Binary file - no diff available.
Propchange: uima/ruta/trunk/ruta-core/src/test/resources/org/apache/uima/ruta/action/trie_compressed.mtwl
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream