You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/08/06 14:48:40 UTC

svn commit: r1369821 - in /opennlp/sandbox/corpus-server-connector: desc/ src/main/java/org/apache/opennlp/corpus_server/connector/

Author: joern
Date: Mon Aug  6 12:48:40 2012
New Revision: 1369821

URL: http://svn.apache.org/viewvc?rev=1369821&view=rev
Log:
OPENNLP-261 Implemented CAS write support. 

Modified:
    opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml
    opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml
    opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java
    opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java

Modified: opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml (original)
+++ opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml Mon Aug  6 12:48:40 2012
@@ -1,68 +1,60 @@
 <?xml version="1.0" encoding="UTF-8"?>
 
-	<!--
-	 ***************************************************************
-	 * Licensed to the Apache Software Foundation (ASF) under one
-	 * or more contributor license agreements.  See the NOTICE file
-	 * distributed with this work for additional information
-	 * regarding copyright ownership.  The ASF licenses this file
-	 * to you under the Apache License, Version 2.0 (the
-	 * "License"); you may not use this file except in compliance
-	 * with the License.  You may obtain a copy of the License at
-         *
-	 *   http://www.apache.org/licenses/LICENSE-2.0
-	 * 
-	 * Unless required by applicable law or agreed to in writing,
-	 * software distributed under the License is distributed on an
-	 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-	 * KIND, either express or implied.  See the License for the
-	 * specific language governing permissions and limitations
-	 * under the License.
-	 ***************************************************************
-   -->
-   
+<!-- *************************************************************** * Licensed 
+	to the Apache Software Foundation (ASF) under one * or more contributor license 
+	agreements. See the NOTICE file * distributed with this work for additional 
+	information * regarding copyright ownership. The ASF licenses this file * 
+	to you under the Apache License, Version 2.0 (the * "License"); you may not 
+	use this file except in compliance * with the License. You may obtain a copy 
+	of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless 
+	required by applicable law or agreed to in writing, * software distributed 
+	under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES 
+	OR CONDITIONS OF ANY * KIND, either express or implied. See the License for 
+	the * specific language governing permissions and limitations * under the 
+	License. *************************************************************** -->
+
 <analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
-  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
-  <primitive>true</primitive>
-  <annotatorImplementationName>org.apache.opennlp.corpus_server.connector.CSCasWriter</annotatorImplementationName>
-  <analysisEngineMetaData>
-    <name>Corpus Server Cas Writer</name>
-    <description>Writes a CAS (back) to the Corpus Server.</description>
-    <configurationParameters>
-		<configurationParameter>
-			<name>CorpusAddress</name>
-			<description>Directory containing input files</description>
-			<type>String</type>
-			<multiValued>false</multiValued>
-			<mandatory>true</mandatory>
-		</configurationParameter>
-    </configurationParameters>
-    <configurationParameterSettings>
-		<nameValuePair>
-			<name>CorpusAddress</name>
-			<value>
-				<string>http://localhost:8080/corpus-server/rest/corpora/wikinews</string>
-			</value>
-		</nameValuePair>
-    </configurationParameterSettings>
-    
-	<!-- TODO: Can TS be imported via http?! Otherwise it must be downloaded by the user! -->
-	<typeSystemDescription>
-		<imports>
-			<import name="org.apache.uima.examples.SourceDocumentInformation"/>
-		</imports>
-	</typeSystemDescription>
-    <capabilities>
-      <capability>
-        <inputs/>
-        <outputs/>
-        <languagesSupported/>
-      </capability>
-    </capabilities>
-	<operationalProperties>
-		<modifiesCas>true</modifiesCas>
-		<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
-		<outputsNewCASes>false</outputsNewCASes>
-	</operationalProperties>
-  </analysisEngineMetaData>
+	<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+	<primitive>true</primitive>
+	<annotatorImplementationName>org.apache.opennlp.corpus_server.connector.CSCasWriter
+	</annotatorImplementationName>
+	<analysisEngineMetaData>
+		<name>Corpus Server Cas Writer</name>
+		<description>Writes a CAS (back) to the Corpus Server.</description>
+		<configurationParameters>
+			<configurationParameter>
+				<name>ServerAddress</name>
+				<description>The location of the server.</description>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>true</mandatory>
+			</configurationParameter>
+			<configurationParameter>
+				<name>CorpusName</name>
+				<description>The id of the corpus which should be processed.
+				</description>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>true</mandatory>
+			</configurationParameter>
+		</configurationParameters>
+
+		<typeSystemDescription>
+			<imports>
+				<import name="org.apache.uima.examples.SourceDocumentInformation" />
+			</imports>
+		</typeSystemDescription>
+		<capabilities>
+			<capability>
+				<inputs />
+				<outputs />
+				<languagesSupported />
+			</capability>
+		</capabilities>
+		<operationalProperties>
+			<modifiesCas>true</modifiesCas>
+			<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+			<outputsNewCASes>false</outputsNewCASes>
+		</operationalProperties>
+	</analysisEngineMetaData>
 </analysisEngineDescription>

Modified: opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml (original)
+++ opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml Mon Aug  6 12:48:40 2012
@@ -63,13 +63,13 @@
 		    <nameValuePair>
 		        <name>ServerAddress</name>
 		        <value>
-		            <string>http://localhost:8080/corpus-server/rest</string>
+		            <string>http://localhost:8080/rest</string>
 		        </value>
 		    </nameValuePair>
 		    <nameValuePair>
 		        <name>CorpusName</name>
 		        <value>
-		            <string>wikinews</string>
+		            <string>enwikinews</string>
 		        </value>
 		    </nameValuePair>
 		    <nameValuePair>

Modified: opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java (original)
+++ opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java Mon Aug  6 12:48:40 2012
@@ -19,42 +19,121 @@ package org.apache.opennlp.corpus_server
 
 import java.io.ByteArrayOutputStream;
 
+import javax.ws.rs.core.MediaType;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
-import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.impl.XmiCasSerializer;
 import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceProcessException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+import org.xml.sax.SAXException;
+
+import com.sun.jersey.api.client.Client;
+import com.sun.jersey.api.client.ClientResponse;
+import com.sun.jersey.api.client.WebResource;
+import com.sun.jersey.api.client.WebResource.Builder;
 
 /**
- * a {@link org.apache.uima.collection.CasConsumer} which puts a passed {@link CAS}
- * inside a {@link CorpusStore}
+ * The CSCasWriter writes a CAS into a Corpus Server.
  */
-public class CSCasWriter extends CasConsumer_ImplBase {
+public class CSCasWriter extends CasAnnotator_ImplBase {
 
-  private static final String CORPUSNAME = "corpusName";
+  private String serverAddress;
+  private String corpusName;
 
-//  private CorpusStore corpusStore;
+  // TODO: Make it configurable
+  private String action = "update";
+  
+  private Type idType;
+  private Feature idFeature;
+  private Logger logger;
 
   @Override
-  public void initialize() throws ResourceInitializationException {
-    super.initialize();
-//    String corpusName = String.valueOf(getConfigParameterValue(CORPUSNAME));
-//    try {
-//      corpusStore = CorpusServer.getInstance().getStore().getCorpus(corpusName);
-//    } catch (IOException e) {
-//      throw new ResourceInitializationException(e);
-//    }
+  public void initialize(UimaContext context)
+      throws ResourceInitializationException {
+    super.initialize(context);
+
+    serverAddress = (String) context.getConfigParameterValue(CSQueueCollectionReader.SERVER_ADDRESS);
+    corpusName = (String) context.getConfigParameterValue(CSQueueCollectionReader.CORPUS_NAME);
+    
+    logger = context.getLogger();
   }
 
   @Override
-  public void processCas(CAS cas) throws ResourceProcessException {
-    ByteArrayOutputStream os = new ByteArrayOutputStream();
-//    try {
-//      XmiCasSerializer.serialize(cas, os);
-//      corpusStore.addCAS(String.valueOf(cas.getDocumentAnnotation().getCoveredText().hashCode()), os.toByteArray());
-//    } catch (Exception e) {
-//      throw new ResourceProcessException(e);
-//    }
-
+  public void typeSystemInit(TypeSystem ts)
+      throws AnalysisEngineProcessException {
+    super.typeSystemInit(ts);
+
+    String idTypeName = (String) getContext().getConfigParameterValue("IdFSTypeName");
+    idType = ts.getType(idTypeName);
+    String idFeatureName = (String) getContext().getConfigParameterValue("IdFeatureName");
+    idFeature = idType.getFeatureByBaseName(idFeatureName);
   }
 
-}
\ No newline at end of file
+  /**
+   * Writes the CAS back to the Corpus Server under the id stored in its id feature structure.
+   *
+   * @param cas the CAS to serialize as XMI and send to the server
+   * @throws AnalysisEngineProcessException if the id feature structure is missing, the CAS
+   *         cannot be serialized, or the configured action is unknown
+   */
+  @Override
+  public void process(CAS cas) throws AnalysisEngineProcessException {
+    FSIterator<FeatureStructure> typeFSIter = cas.getIndexRepository().getAllIndexedFS(idType);
+    if (typeFSIter.hasNext()) {
+      FeatureStructure idFs = typeFSIter.next();
+      String casId = idFs.getFeatureValueAsString(idFeature);
+      // TODO: Remove the FS here, so it's client side only!
+      // Was inserted in the reader ...
+      cas.removeFsFromIndexes(idFs);
+
+      ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream();
+      XmiCasSerializer serializer = new XmiCasSerializer(cas.getTypeSystem());
+      try {
+        serializer.serialize(cas, xmiBytes);
+      } catch (SAXException e) {
+        // Keep the cause so the serialization failure can be diagnosed.
+        throw new AnalysisEngineProcessException(e);
+      }
+
+      Client client = Client.create();
+      WebResource corpusWebResource = client.resource(serverAddress + "/corpora/" + corpusName);
+      Builder casResponseBuilder = corpusWebResource.path(casId)
+          .accept(MediaType.TEXT_XML).header("Content-Type", MediaType.TEXT_XML);
+      // Send the serialized bytes rather than the stream object itself.
+      ClientResponse response;
+      if ("add".equals(action)) {
+        response = casResponseBuilder.post(ClientResponse.class, xmiBytes.toByteArray());
+      }
+      else if ("update".equals(action)) {
+        response = casResponseBuilder.put(ClientResponse.class, xmiBytes.toByteArray());
+      }
+      else {
+        throw new AnalysisEngineProcessException(new Exception("Unkown action: " + action));
+      }
+      // 400 (Bad Request) is an error as well, so the lower bound is inclusive.
+      int statusCode = response.getStatus();
+      if (statusCode >= 400) {
+        if (logger.isLoggable(Level.SEVERE)) {
+          logger.log(Level.SEVERE, "Error (" + statusCode + "), " + action + ", " + casId);
+        }
+      }
+      else {
+        if (logger.isLoggable(Level.FINE)) {
+          logger.log(Level.FINE, "OK (" + statusCode + "),  " + action + ", " + casId);
+        }
+      }
+    }
+    else {
+      throw new AnalysisEngineProcessException(new Exception("Missing Id Feature Structure!"));
+    }
+  }
+}

Modified: opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java (original)
+++ opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java Mon Aug  6 12:48:40 2012
@@ -26,9 +26,15 @@ import java.util.List;
 import javax.ws.rs.core.MediaType;
 
 import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.collection.CollectionException;
 import org.apache.uima.collection.CollectionReader_ImplBase;
 import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
 import org.apache.uima.util.Progress;
 
 import com.sun.jersey.api.client.Client;
@@ -41,9 +47,9 @@ import com.sun.jersey.api.client.WebReso
  */
 public class CSQueueCollectionReader extends CollectionReader_ImplBase {
 
-  private static final String SERVER_ADDRESS = "ServerAddress";
+  static final String SERVER_ADDRESS = "ServerAddress";
   
-  private static final String CORPUS_NAME = "CorpusName";
+  static final String CORPUS_NAME = "CorpusName";
 
   private static final String SEARCH_QUERY = "SearchQuery";
   
@@ -53,13 +59,20 @@ public class CSQueueCollectionReader ext
   
   private String corpusName;
   
+  private Type idType;
+  private Feature idFeature;
+  
   private Iterator<String> casIds;
 
+  private Logger logger;
+
 
   @Override
   public void initialize() throws ResourceInitializationException {
     super.initialize();
     
+    logger = getLogger();
+    
     serverAddress = (String) getConfigParameterValue(SERVER_ADDRESS);
     
     // Retrieve corpus address ...
@@ -83,6 +96,15 @@ public class CSQueueCollectionReader ext
           // TODO: How to fix this? Shouldn't accept do it?
           .header("Content-Type", MediaType.TEXT_XML)
           .post(ClientResponse.class);
+      
+      if (response.getStatus() != 204) {
+    	  throw new ResourceInitializationException(
+    			  new Exception("Failed to create queue: " + response.getStatus()));
+      }
+      
+      if (logger.isLoggable(Level.INFO)) {
+        logger.log(Level.INFO, "Sucessfully created queue: " + queueName + " for corpus: " + corpusName);
+      }
     }
     
     // Retrieve queue link ...
@@ -93,6 +115,8 @@ public class CSQueueCollectionReader ext
     WebResource r = client.resource(serverAddress +  "/queues/" + queueName);
     
     while (true) {
+      System.out.println("Requesting next CAS ID!");
+    	
       // TODO: Make query configurable ...
       ClientResponse response = r
               .path("_nextTask")
@@ -109,16 +133,30 @@ public class CSQueueCollectionReader ext
       }
       String casId = response.getEntity(String.class);
       casIdList.add(casId);
+      
+      System.out.println("Received CAS ID: " + casId);
     }
     
     casIds = casIdList.iterator();
   }
   
   @Override
+  public void typeSystemInit(TypeSystem ts) throws ResourceInitializationException {
+    super.typeSystemInit(ts);
+    // Optional: getNext only attaches the CAS id when both were resolved, so tolerate absence.
+    String idTypeName = (String) getConfigParameterValue("IdFSTypeName");
+    idType = (idTypeName != null) ? ts.getType(idTypeName) : null;
+    String idFeatureName = (String) getConfigParameterValue("IdFeatureName");
+    idFeature = (idType != null && idFeatureName != null)
+        ? idType.getFeatureByBaseName(idFeatureName) : null;
+  }
+  
+  @Override
   public void getNext(CAS cas) throws IOException, CollectionException {
 	  
     String casId = casIds.next();
 	
+    
     Client client = Client.create();
     
     WebResource corpusWebResource = client.resource(serverAddress + "/corpora/" + corpusName);
@@ -130,10 +168,26 @@ public class CSQueueCollectionReader ext
         .get(ClientResponse.class);
     
     InputStream casIn = casResponse.getEntityInputStream();
-	  
-    UimaUtil.deserializeXmiCAS(cas, casIn);
     
-    casIn.close();
+    try {
+      UimaUtil.deserializeXmiCAS(cas, casIn);
+    }
+    catch (IOException e) {
+      if (logger.isLoggable(Level.SEVERE)) {
+        logger.log(Level.SEVERE, "Failed to load CAS: " +  casId + " code: " + casResponse.getStatus());
+      }
+      
+      throw e;
+    }
+    finally {
+      casIn.close();
+    }
+    
+    if (idType != null && idFeature != null) {
+      FeatureStructure idFS = cas.createFS(idType);
+      idFS.setStringValue(idFeature, casId);
+      cas.addFsToIndexes(idFS);
+    }
   }
 
   @Override