You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/08/06 14:48:40 UTC
svn commit: r1369821 - in /opennlp/sandbox/corpus-server-connector: desc/
src/main/java/org/apache/opennlp/corpus_server/connector/
Author: joern
Date: Mon Aug 6 12:48:40 2012
New Revision: 1369821
URL: http://svn.apache.org/viewvc?rev=1369821&view=rev
Log:
OPENNLP-261 Implemented CAS write support.
Modified:
opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml
opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml
opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java
opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java
Modified: opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml (original)
+++ opennlp/sandbox/corpus-server-connector/desc/CSCasWriter.xml Mon Aug 6 12:48:40 2012
@@ -1,68 +1,60 @@
<?xml version="1.0" encoding="UTF-8"?>
- <!--
- ***************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- ***************************************************************
- -->
-
+<!-- *************************************************************** * Licensed
+ to the Apache Software Foundation (ASF) under one * or more contributor license
+ agreements. See the NOTICE file * distributed with this work for additional
+ information * regarding copyright ownership. The ASF licenses this file *
+ to you under the Apache License, Version 2.0 (the * "License"); you may not
+ use this file except in compliance * with the License. You may obtain a copy
+ of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless
+ required by applicable law or agreed to in writing, * software distributed
+ under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES
+ OR CONDITIONS OF ANY * KIND, either express or implied. See the License for
+ the * specific language governing permissions and limitations * under the
+ License. *************************************************************** -->
+
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
- <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
- <primitive>true</primitive>
- <annotatorImplementationName>org.apache.opennlp.corpus_server.connector.CSCasWriter</annotatorImplementationName>
- <analysisEngineMetaData>
- <name>Corpus Server Cas Writer</name>
- <description>Writes a CAS (back) to the Corpus Server.</description>
- <configurationParameters>
- <configurationParameter>
- <name>CorpusAddress</name>
- <description>Directory containing input files</description>
- <type>String</type>
- <multiValued>false</multiValued>
- <mandatory>true</mandatory>
- </configurationParameter>
- </configurationParameters>
- <configurationParameterSettings>
- <nameValuePair>
- <name>CorpusAddress</name>
- <value>
- <string>http://localhost:8080/corpus-server/rest/corpora/wikinews</string>
- </value>
- </nameValuePair>
- </configurationParameterSettings>
-
- <!-- TODO: Can TS be imported via http?! Otherwise it must be downloaded by the user! -->
- <typeSystemDescription>
- <imports>
- <import name="org.apache.uima.examples.SourceDocumentInformation"/>
- </imports>
- </typeSystemDescription>
- <capabilities>
- <capability>
- <inputs/>
- <outputs/>
- <languagesSupported/>
- </capability>
- </capabilities>
- <operationalProperties>
- <modifiesCas>true</modifiesCas>
- <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
- <outputsNewCASes>false</outputsNewCASes>
- </operationalProperties>
- </analysisEngineMetaData>
+ <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+ <primitive>true</primitive>
+ <annotatorImplementationName>org.apache.opennlp.corpus_server.connector.CSCasWriter
+ </annotatorImplementationName>
+ <analysisEngineMetaData>
+ <name>Corpus Server Cas Writer</name>
+ <description>Writes a CAS (back) to the Corpus Server.</description>
+ <configurationParameters>
+ <configurationParameter>
+ <name>ServerAddress</name>
+ <description>The location of the server.</description>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>true</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>CorpusName</name>
+ <description>The id of the corpus which should be processed.
+ </description>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>true</mandatory>
+ </configurationParameter>
+ </configurationParameters>
+
+ <typeSystemDescription>
+ <imports>
+ <import name="org.apache.uima.examples.SourceDocumentInformation" />
+ </imports>
+ </typeSystemDescription>
+ <capabilities>
+ <capability>
+ <inputs />
+ <outputs />
+ <languagesSupported />
+ </capability>
+ </capabilities>
+ <operationalProperties>
+ <modifiesCas>true</modifiesCas>
+ <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+ <outputsNewCASes>false</outputsNewCASes>
+ </operationalProperties>
+ </analysisEngineMetaData>
</analysisEngineDescription>
Modified: opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml (original)
+++ opennlp/sandbox/corpus-server-connector/desc/CSQueueCollectionReader.xml Mon Aug 6 12:48:40 2012
@@ -63,13 +63,13 @@
<nameValuePair>
<name>ServerAddress</name>
<value>
- <string>http://localhost:8080/corpus-server/rest</string>
+ <string>http://localhost:8080/rest</string>
</value>
</nameValuePair>
<nameValuePair>
<name>CorpusName</name>
<value>
- <string>wikinews</string>
+ <string>enwikinews</string>
</value>
</nameValuePair>
<nameValuePair>
Modified: opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java (original)
+++ opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSCasWriter.java Mon Aug 6 12:48:40 2012
@@ -19,42 +19,121 @@ package org.apache.opennlp.corpus_server
import java.io.ByteArrayOutputStream;
+import javax.ws.rs.core.MediaType;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
-import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceProcessException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+import org.xml.sax.SAXException;
+
+import com.sun.jersey.api.client.Client;
+import com.sun.jersey.api.client.ClientResponse;
+import com.sun.jersey.api.client.WebResource;
+import com.sun.jersey.api.client.WebResource.Builder;
/**
- * a {@link org.apache.uima.collection.CasConsumer} which puts a passed {@link CAS}
- * inside a {@link CorpusStore}
+ * The CSCasWriter writes a CAS into a Corpus Server.
*/
-public class CSCasWriter extends CasConsumer_ImplBase {
+public class CSCasWriter extends CasAnnotator_ImplBase {
- private static final String CORPUSNAME = "corpusName";
+ private String serverAddress;
+ private String corpusName;
-// private CorpusStore corpusStore;
+ // TODO: Make it configurable
+ private String action = "update";
+
+ private Type idType;
+ private Feature idFeature;
+ private Logger logger;
@Override
- public void initialize() throws ResourceInitializationException {
- super.initialize();
-// String corpusName = String.valueOf(getConfigParameterValue(CORPUSNAME));
-// try {
-// corpusStore = CorpusServer.getInstance().getStore().getCorpus(corpusName);
-// } catch (IOException e) {
-// throw new ResourceInitializationException(e);
-// }
+ public void initialize(UimaContext context)
+ throws ResourceInitializationException {
+ super.initialize(context);
+
+ serverAddress = (String) context.getConfigParameterValue(CSQueueCollectionReader.SERVER_ADDRESS);
+ corpusName = (String) context.getConfigParameterValue(CSQueueCollectionReader.CORPUS_NAME);
+
+ logger = context.getLogger();
}
@Override
- public void processCas(CAS cas) throws ResourceProcessException {
- ByteArrayOutputStream os = new ByteArrayOutputStream();
-// try {
-// XmiCasSerializer.serialize(cas, os);
-// corpusStore.addCAS(String.valueOf(cas.getDocumentAnnotation().getCoveredText().hashCode()), os.toByteArray());
-// } catch (Exception e) {
-// throw new ResourceProcessException(e);
-// }
-
+ public void typeSystemInit(TypeSystem ts)
+ throws AnalysisEngineProcessException {
+ super.typeSystemInit(ts);
+
+ String idTypeName = (String) getContext().getConfigParameterValue("IdFSTypeName");
+ idType = ts.getType(idTypeName);
+ String idFeatureName = (String) getContext().getConfigParameterValue("IdFeatureName");
+ idFeature = idType.getFeatureByBaseName(idFeatureName);
}
-}
\ No newline at end of file
+ /**
+  * Serializes the CAS as XMI and sends it to the corpus server.
+  * <p>
+  * The CAS id is read from the first indexed feature structure of the
+  * configured id type; that feature structure is removed from the indexes
+  * before the CAS is serialized. The XMI bytes are sent to
+  * {@code <serverAddress>/corpora/<corpusName>/<casId>} using POST for the
+  * "add" action and PUT for the "update" action. The HTTP status of the
+  * response is only logged; an error status does not abort processing.
+  *
+  * @param cas the CAS to write
+  * @throws AnalysisEngineProcessException if the CAS has no id feature
+  *         structure, if XMI serialization fails, or if the action is unknown
+  */
+ @Override
+ public void process(CAS cas) throws AnalysisEngineProcessException {
+
+   FSIterator<FeatureStructure> typeFSIter = cas.getIndexRepository().getAllIndexedFS(idType);
+
+   if (!typeFSIter.hasNext()) {
+     throw new AnalysisEngineProcessException(new Exception("Missing Id Feature Structure!"));
+   }
+
+   FeatureStructure idFs = typeFSIter.next();
+
+   String casId = idFs.getFeatureValueAsString(idFeature);
+
+   // TODO: Remove the FS here, so it's client side only!
+   // Was inserted in the reader ...
+   cas.removeFsFromIndexes(idFs);
+
+   ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream();
+   XmiCasSerializer serializer = new XmiCasSerializer(cas.getTypeSystem());
+   try {
+     serializer.serialize(cas, xmiBytes);
+   } catch (SAXException e) {
+     // Keep the cause, otherwise the serialization failure cannot be diagnosed.
+     throw new AnalysisEngineProcessException(e);
+   }
+
+   Client client = Client.create();
+
+   WebResource corpusWebResource = client.resource(serverAddress + "/corpora/"
+       + corpusName);
+
+   Builder casResponseBuilder = corpusWebResource.path(casId)
+       .accept(MediaType.TEXT_XML).header("Content-Type", MediaType.TEXT_XML);
+
+   // Send the serialized bytes as the request entity, not the stream object itself.
+   byte[] xmiData = xmiBytes.toByteArray();
+
+   ClientResponse response;
+   if ("add".equals(action)) {
+     response = casResponseBuilder.post(ClientResponse.class, xmiData);
+   }
+   else if ("update".equals(action)) {
+     response = casResponseBuilder.put(ClientResponse.class, xmiData);
+   }
+   else {
+     throw new AnalysisEngineProcessException(new Exception("Unknown action: " + action));
+   }
+
+   int statusCode = response.getStatus();
+
+   // All 4xx and 5xx codes are errors, including 400 itself.
+   if (statusCode >= 400) {
+     if (logger.isLoggable(Level.SEVERE)) {
+       logger.log(Level.SEVERE, "Error (" + statusCode + "), " + action + ", " + casId);
+     }
+   }
+   else {
+     if (logger.isLoggable(Level.FINE)) {
+       logger.log(Level.FINE, "OK (" + statusCode + "), " + action + ", " + casId);
+     }
+   }
+ }
+}
Modified: opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java?rev=1369821&r1=1369820&r2=1369821&view=diff
==============================================================================
--- opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java (original)
+++ opennlp/sandbox/corpus-server-connector/src/main/java/org/apache/opennlp/corpus_server/connector/CSQueueCollectionReader.java Mon Aug 6 12:48:40 2012
@@ -26,9 +26,15 @@ import java.util.List;
import javax.ws.rs.core.MediaType;
import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
import org.apache.uima.util.Progress;
import com.sun.jersey.api.client.Client;
@@ -41,9 +47,9 @@ import com.sun.jersey.api.client.WebReso
*/
public class CSQueueCollectionReader extends CollectionReader_ImplBase {
- private static final String SERVER_ADDRESS = "ServerAddress";
+ static final String SERVER_ADDRESS = "ServerAddress";
- private static final String CORPUS_NAME = "CorpusName";
+ static final String CORPUS_NAME = "CorpusName";
private static final String SEARCH_QUERY = "SearchQuery";
@@ -53,13 +59,20 @@ public class CSQueueCollectionReader ext
private String corpusName;
+ private Type idType;
+ private Feature idFeature;
+
private Iterator<String> casIds;
+ private Logger logger;
+
@Override
public void initialize() throws ResourceInitializationException {
super.initialize();
+ logger = getLogger();
+
serverAddress = (String) getConfigParameterValue(SERVER_ADDRESS);
// Retrieve corpus address ...
@@ -83,6 +96,15 @@ public class CSQueueCollectionReader ext
// TODO: How to fix this? Shouldn't accept do it?
.header("Content-Type", MediaType.TEXT_XML)
.post(ClientResponse.class);
+
+ if (response.getStatus() != 204) {
+ throw new ResourceInitializationException(
+ new Exception("Failed to create queue: " + response.getStatus()));
+ }
+
+ if (logger.isLoggable(Level.INFO)) {
+ logger.log(Level.INFO, "Sucessfully created queue: " + queueName + " for corpus: " + corpusName);
+ }
}
// Retrieve queue link ...
@@ -93,6 +115,8 @@ public class CSQueueCollectionReader ext
WebResource r = client.resource(serverAddress + "/queues/" + queueName);
while (true) {
+ System.out.println("Requesting next CAS ID!");
+
// TODO: Make query configurable ...
ClientResponse response = r
.path("_nextTask")
@@ -109,16 +133,30 @@ public class CSQueueCollectionReader ext
}
String casId = response.getEntity(String.class);
casIdList.add(casId);
+
+ System.out.println("Received CAS ID: " + casId);
}
casIds = casIdList.iterator();
}
@Override
+ public void typeSystemInit(TypeSystem ts)
+     throws ResourceInitializationException {
+   super.typeSystemInit(ts);
+
+   // The id type and feature are optional: getNext only attaches the CAS id
+   // when both were resolved, so a missing parameter or an unknown type must
+   // leave the fields null instead of failing with a NullPointerException.
+   String idTypeName = (String) getConfigParameterValue("IdFSTypeName");
+   String idFeatureName = (String) getConfigParameterValue("IdFeatureName");
+
+   idType = idTypeName != null ? ts.getType(idTypeName) : null;
+
+   if (idType != null && idFeatureName != null) {
+     idFeature = idType.getFeatureByBaseName(idFeatureName);
+   }
+   else {
+     idFeature = null;
+   }
+ }
+
+ @Override
public void getNext(CAS cas) throws IOException, CollectionException {
String casId = casIds.next();
+
Client client = Client.create();
WebResource corpusWebResource = client.resource(serverAddress + "/corpora/" + corpusName);
@@ -130,10 +168,26 @@ public class CSQueueCollectionReader ext
.get(ClientResponse.class);
InputStream casIn = casResponse.getEntityInputStream();
-
- UimaUtil.deserializeXmiCAS(cas, casIn);
- casIn.close();
+ try {
+ UimaUtil.deserializeXmiCAS(cas, casIn);
+ }
+ catch (IOException e) {
+ if (logger.isLoggable(Level.SEVERE)) {
+ logger.log(Level.SEVERE, "Failed to load CAS: " + casId + " code: " + casResponse.getStatus());
+ }
+
+ throw e;
+ }
+ finally {
+ casIn.close();
+ }
+
+ if (idType != null && idFeature != null) {
+ FeatureStructure idFS = cas.createFS(idType);
+ idFS.setStringValue(idFeature, casId);
+ cas.addFsToIndexes(idFS);
+ }
}
@Override