You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2009/01/08 00:26:47 UTC
svn commit: r732555 [2/2] - in /incubator/uima/sandbox/trunk/TikaAnnotator:
./ desc/ src/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/uima/
src/main/java/org/apache/uima/tika/
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.tcas.Annotation_Type;
+
+/**
+ * JCas type-support class for the CAS type "org.apache.uima.MarkupAnnotation".
+ * The constructor resolves each feature of the type once; the accessors below
+ * then read and write those features through the low-level CAS, addressing
+ * feature structures by their integer address.
+ * Updated by JCasGen Thu Sep 18 08:31:44 BST 2008
+ * @generated */
+public class MarkupAnnotation_Type extends Annotation_Type {
+ /** Returns the generator that builds MarkupAnnotation cover objects.
+ * @generated */
+ protected FSGenerator getFSGenerator() {return fsGenerator;}
+ /** Creates the MarkupAnnotation cover object for a CAS address. When
+ * useExistingInstance is set, the instance already cached in the JCas for
+ * that address is returned, or a new one is created and cached.
+ * @generated */
+ private final FSGenerator fsGenerator =
+ new FSGenerator() {
+ public FeatureStructure createFS(int addr, CASImpl cas) {
+ if (MarkupAnnotation_Type.this.useExistingInstance) {
+ // Return eq fs instance if already created
+ FeatureStructure fs = MarkupAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
+ if (null == fs) {
+ fs = new MarkupAnnotation(addr, MarkupAnnotation_Type.this);
+ MarkupAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
+ return fs;
+ }
+ return fs;
+ } else return new MarkupAnnotation(addr, MarkupAnnotation_Type.this);
+ }
+ };
+ /** Same value as MarkupAnnotation.typeIndexID.
+ * @generated */
+ public final static int typeIndexID = MarkupAnnotation.typeIndexID;
+ /** Flag obtained from the JCasRegistry; when true, accessors call
+ * throwFeatMissing if the requested feature was not resolved.
+ * @generated
+ @modifiable */
+ public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.MarkupAnnotation");
+
+ // ---- Feature "attributes" (range uima.cas.FSArray) ----
+ /** @generated */
+ final Feature casFeat_attributes;
+ /** @generated */
+ final int casFeatCode_attributes;
+ /** Returns the reference stored in the "attributes" feature of the FS at addr.
+ * @generated */
+ public int getAttributes(int addr) {
+ if (featOkTst && casFeat_attributes == null)
+ jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+ return ll_cas.ll_getRefValue(addr, casFeatCode_attributes);
+ }
+ /** Stores the reference v in the "attributes" feature of the FS at addr.
+ * @generated */
+ public void setAttributes(int addr, int v) {
+ if (featOkTst && casFeat_attributes == null)
+ jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+ ll_cas.ll_setRefValue(addr, casFeatCode_attributes, v);}
+
+ /** Returns element i of the array referenced by the "attributes" feature.
+ * With lowLevelTypeChecks set the type-checked array read is used; otherwise
+ * the index is bounds-checked first and the unchecked read is used.
+ * @generated */
+ public int getAttributes(int addr, int i) {
+ if (featOkTst && casFeat_attributes == null)
+ jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+ if (lowLevelTypeChecks)
+ return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i, true);
+ jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i);
+ return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i);
+ }
+
+ /** Sets element i of the array referenced by the "attributes" feature to v.
+ * @generated */
+ public void setAttributes(int addr, int i, int v) {
+ if (featOkTst && casFeat_attributes == null)
+ jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+ // NOTE(review): there is no return/else after the type-checked write, so when
+ // lowLevelTypeChecks is set the bounds check and the unchecked write run as well.
+ if (lowLevelTypeChecks)
+ ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i, v, true);
+ jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i);
+ ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i, v);
+ }
+
+
+ // ---- Feature "children" (range uima.cas.FSArray) ----
+ /** @generated */
+ final Feature casFeat_children;
+ /** @generated */
+ final int casFeatCode_children;
+ /** Returns the reference stored in the "children" feature of the FS at addr.
+ * @generated */
+ public int getChildren(int addr) {
+ if (featOkTst && casFeat_children == null)
+ jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+ return ll_cas.ll_getRefValue(addr, casFeatCode_children);
+ }
+ /** Stores the reference v in the "children" feature of the FS at addr.
+ * @generated */
+ public void setChildren(int addr, int v) {
+ if (featOkTst && casFeat_children == null)
+ jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+ ll_cas.ll_setRefValue(addr, casFeatCode_children, v);}
+
+ /** Returns element i of the array referenced by the "children" feature.
+ * With lowLevelTypeChecks set the type-checked array read is used; otherwise
+ * the index is bounds-checked first and the unchecked read is used.
+ * @generated */
+ public int getChildren(int addr, int i) {
+ if (featOkTst && casFeat_children == null)
+ jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+ if (lowLevelTypeChecks)
+ return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i, true);
+ jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_children), i);
+ return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i);
+ }
+
+ /** Sets element i of the array referenced by the "children" feature to v.
+ * @generated */
+ public void setChildren(int addr, int i, int v) {
+ if (featOkTst && casFeat_children == null)
+ jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+ // NOTE(review): as in setAttributes(int, int, int), the unchecked write below
+ // also runs after the type-checked write when lowLevelTypeChecks is set.
+ if (lowLevelTypeChecks)
+ ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i, v, true);
+ jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_children), i);
+ ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i, v);
+ }
+
+
+ // ---- Feature "name" (range uima.cas.String) ----
+ /** @generated */
+ final Feature casFeat_name;
+ /** @generated */
+ final int casFeatCode_name;
+ /** Returns the string value of the "name" feature of the FS at addr.
+ * @generated */
+ public String getName(int addr) {
+ if (featOkTst && casFeat_name == null)
+ jcas.throwFeatMissing("name", "org.apache.uima.MarkupAnnotation");
+ return ll_cas.ll_getStringValue(addr, casFeatCode_name);
+ }
+ /** Sets the string value of the "name" feature of the FS at addr.
+ * @generated */
+ public void setName(int addr, String v) {
+ if (featOkTst && casFeat_name == null)
+ jcas.throwFeatMissing("name", "org.apache.uima.MarkupAnnotation");
+ ll_cas.ll_setStringValue(addr, casFeatCode_name, v);}
+
+
+
+ // ---- Feature "parent" (range org.apache.uima.MarkupAnnotation) ----
+ /** @generated */
+ final Feature casFeat_parent;
+ /** @generated */
+ final int casFeatCode_parent;
+ /** Returns the reference stored in the "parent" feature of the FS at addr.
+ * @generated */
+ public int getParent(int addr) {
+ if (featOkTst && casFeat_parent == null)
+ jcas.throwFeatMissing("parent", "org.apache.uima.MarkupAnnotation");
+ return ll_cas.ll_getRefValue(addr, casFeatCode_parent);
+ }
+ /** Stores the reference v in the "parent" feature of the FS at addr.
+ * @generated */
+ public void setParent(int addr, int v) {
+ if (featOkTst && casFeat_parent == null)
+ jcas.throwFeatMissing("parent", "org.apache.uima.MarkupAnnotation");
+ ll_cas.ll_setRefValue(addr, casFeatCode_parent, v);}
+
+
+
+ // ---- Feature "qualifiedName" (range uima.cas.String) ----
+ /** @generated */
+ final Feature casFeat_qualifiedName;
+ /** @generated */
+ final int casFeatCode_qualifiedName;
+ /** Returns the string value of the "qualifiedName" feature of the FS at addr.
+ * @generated */
+ public String getQualifiedName(int addr) {
+ if (featOkTst && casFeat_qualifiedName == null)
+ jcas.throwFeatMissing("qualifiedName", "org.apache.uima.MarkupAnnotation");
+ return ll_cas.ll_getStringValue(addr, casFeatCode_qualifiedName);
+ }
+ /** Sets the string value of the "qualifiedName" feature of the FS at addr.
+ * @generated */
+ public void setQualifiedName(int addr, String v) {
+ if (featOkTst && casFeat_qualifiedName == null)
+ jcas.throwFeatMissing("qualifiedName", "org.apache.uima.MarkupAnnotation");
+ ll_cas.ll_setStringValue(addr, casFeatCode_qualifiedName, v);}
+
+
+
+ // ---- Feature "uri" (range uima.cas.String) ----
+ /** @generated */
+ final Feature casFeat_uri;
+ /** @generated */
+ final int casFeatCode_uri;
+ /** Returns the string value of the "uri" feature of the FS at addr.
+ * @generated */
+ public String getUri(int addr) {
+ if (featOkTst && casFeat_uri == null)
+ jcas.throwFeatMissing("uri", "org.apache.uima.MarkupAnnotation");
+ return ll_cas.ll_getStringValue(addr, casFeatCode_uri);
+ }
+ /** Sets the string value of the "uri" feature of the FS at addr.
+ * @generated */
+ public void setUri(int addr, String v) {
+ if (featOkTst && casFeat_uri == null)
+ jcas.throwFeatMissing("uri", "org.apache.uima.MarkupAnnotation");
+ ll_cas.ll_setStringValue(addr, casFeatCode_uri, v);}
+
+
+
+
+
+ /** initialize variables to correspond with Cas Type and Features.
+ * Registers this type's FS generator with the CAS, then looks up each feature
+ * by name and range type; a feature that could not be resolved gets
+ * JCas.INVALID_FEATURE_CODE as its code.
+ * @generated */
+ public MarkupAnnotation_Type(JCas jcas, Type casType) {
+ super(jcas, casType);
+ casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+
+ casFeat_attributes = jcas.getRequiredFeatureDE(casType, "attributes", "uima.cas.FSArray", featOkTst);
+ casFeatCode_attributes = (null == casFeat_attributes) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_attributes).getCode();
+
+
+ casFeat_children = jcas.getRequiredFeatureDE(casType, "children", "uima.cas.FSArray", featOkTst);
+ casFeatCode_children = (null == casFeat_children) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_children).getCode();
+
+
+ casFeat_name = jcas.getRequiredFeatureDE(casType, "name", "uima.cas.String", featOkTst);
+ casFeatCode_name = (null == casFeat_name) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_name).getCode();
+
+
+ casFeat_parent = jcas.getRequiredFeatureDE(casType, "parent", "org.apache.uima.MarkupAnnotation", featOkTst);
+ casFeatCode_parent = (null == casFeat_parent) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_parent).getCode();
+
+
+ casFeat_qualifiedName = jcas.getRequiredFeatureDE(casType, "qualifiedName", "uima.cas.String", featOkTst);
+ casFeatCode_qualifiedName = (null == casFeat_qualifiedName) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_qualifiedName).getCode();
+
+
+ casFeat_uri = jcas.getRequiredFeatureDE(casType, "uri", "uima.cas.String", featOkTst);
+ casFeatCode_uri = (null == casFeat_uri) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_uri).getCode();
+
+ }
+}
+
+
+
+
\ No newline at end of file
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.uima.FeatureValue;
+import org.apache.uima.SourceDocumentAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Type;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+
+
+/**
+ * Uses Tika to convert the original markup of a document into UIMA annotations.
+ * <p>
+ * The document bytes are read from the "original" view, parsed with Tika, and
+ * the extracted plain text is stored in a newly created text view together
+ * with one MarkupAnnotation per markup element. The metadata reported by Tika
+ * is attached to a SourceDocumentAnnotation in the text view as FeatureValue
+ * entries.
+ */
+public class MarkupAnnotator extends CasAnnotator_ImplBase {
+
+  private final static String ORIGINAL_VIEW_PARAM_NAME = "ORIGINAL_VIEW_PARAM_NAME";
+  private final static String TEXT_VIEW_PARAM_NAME = "TEXT_VIEW_PARAM_NAME";
+  private final static String SET_TEXT_VIEW_DEFAULT_PARAM_NAME = "SET_TEXT_VIEW_DEFAULT_PARAM_NAME";
+
+  private final static String tika_file_param = "tikaConfigFile";
+
+  // fallback values used when the corresponding parameter is not configured
+  private final static String DEFAULT_ORIGINAL_VIEW_NAME = "_InitialView";
+  private final static String DEFAULT_TEXT_VIEW_NAME = "textView";
+
+  // takes an option indicating the name of the view containing the binary document
+  private String originalViewName = DEFAULT_ORIGINAL_VIEW_NAME;
+
+  // takes an option indicating the name of the view containing the text version of the document
+  private String textViewName = DEFAULT_TEXT_VIEW_NAME;
+
+  // whether to make the text view default or not
+  // (read from the configuration but not used anywhere in this class yet)
+  private Boolean makeTextDefaultView = Boolean.TRUE;
+
+  // configuration for TIKA - can be created by specifying a custom resource
+  private TikaConfig config = null;
+
+  /**
+   * Reads the configuration parameters and prepares the Tika configuration.
+   * Every parameter that is not set falls back to its default value. A custom
+   * Tika configuration is loaded from the "tikaConfigFile" resource when one is
+   * available; otherwise Tika's default configuration is used.
+   *
+   * @param aContext the UIMA context holding parameters and resources
+   * @throws ResourceInitializationException if the default Tika configuration cannot be created
+   */
+  public void initialize(UimaContext aContext) throws ResourceInitializationException {
+    super.initialize(aContext);
+
+    // Keep the default when the parameter is absent instead of storing null,
+    // which would make every getView(originalViewName) call fail later on.
+    String configuredOriginalView = (String) aContext.getConfigParameterValue(ORIGINAL_VIEW_PARAM_NAME);
+    originalViewName = (configuredOriginalView != null) ? configuredOriginalView : DEFAULT_ORIGINAL_VIEW_NAME;
+
+    textViewName = (String) aContext.getConfigParameterValue(TEXT_VIEW_PARAM_NAME);
+    if (textViewName == null) {
+      System.err.println("Parameter TEXT_VIEW_PARAM_NAME is null; setting to \"textView\"");
+      textViewName = DEFAULT_TEXT_VIEW_NAME;
+    } else {
+      System.err.println("Parameter TEXT_VIEW_PARAM_NAME is "+textViewName);
+    }
+
+    makeTextDefaultView = (Boolean) aContext.getConfigParameterValue(SET_TEXT_VIEW_DEFAULT_PARAM_NAME);
+    if (makeTextDefaultView == null) {
+      System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is null; setting to \"true\"");
+      makeTextDefaultView = Boolean.TRUE;
+    } else {
+      System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is "+makeTextDefaultView);
+    }
+
+    // initialise TIKA parser
+    // try to get a custom config; a missing resource is not an error
+    config = null;
+    URL tikaConfigURL = null;
+    try {
+      tikaConfigURL = getContext().getResourceURL(tika_file_param);
+      if (tikaConfigURL != null) {
+        config = new TikaConfig(tikaConfigURL);
+      }
+    } catch (Exception e1) {
+      // best effort: report the cause and fall back to the default configuration
+      System.err.println("Failed to load TIKA config file from "+tikaConfigURL+" : "+e1);
+      config = null;
+    }
+
+    // if not rely on default one
+    if (config == null) {
+      try {
+        config = TikaConfig.getDefaultConfig();
+      } catch (TikaException e) {
+        throw new ResourceInitializationException(e);
+      }
+    }
+  }
+
+  /**
+   * Converts the document found in the original view into a text view with
+   * markup annotations and document metadata.
+   * <p>
+   * Conversion problems are reported on standard error and leave the CAS
+   * without a text view; they do not abort the pipeline.
+   *
+   * @param cas the CAS holding the original document
+   * @throws AnalysisEngineProcessException declared by the framework contract
+   */
+  public void process(CAS cas) throws AnalysisEngineProcessException {
+    CAS originalCas = null;
+    try {
+      originalCas = cas.getView(originalViewName);
+    }
+    catch (Exception e){
+      String viewName = cas.getViewName();
+      // can't find originalViewName
+      System.err.println("can't find view "+originalViewName+" using "+viewName+" instead");
+      originalCas = cas.getCurrentView();
+    }
+
+    // A view without a Sofa or without data has nothing to parse; bail out
+    // here rather than hitting a NullPointerException further down.
+    if (originalCas.getSofa() == null) {
+      System.err.println("Problem converting file : no Sofa in view "+originalCas.getViewName());
+      return;
+    }
+    InputStream originalStream = originalCas.getSofa().getSofaDataStream();
+    if (originalStream == null) {
+      System.err.println("Problem converting file : no data in view "+originalCas.getViewName());
+      return;
+    }
+
+    // language detection is not implemented yet, so this stays null
+    String lang = null;
+
+    // parsing with TIKA
+
+    // TODO if content type is known then we use it
+    // otherwise we guess
+
+    Parser parser = new AutoDetectParser(config);
+
+    Metadata md = new Metadata();
+    MarkupHandler handler = new MarkupHandler();
+
+    try {
+      parser.parse(originalStream, handler, md);
+    }
+    catch (Exception e){
+      // if we have a problem just dump the message and continue
+      System.err.println("Problem converting file : "+e.getMessage());
+      // PROBLEM => trying to serialize binary content in XML crash!
+      return;
+    }
+    finally {
+      try {
+        originalStream.close();
+      } catch (IOException ignored) {
+        // nothing useful can be done if closing the input fails
+      }
+    }
+
+    CAS plainTextView = cas.createView(textViewName);
+
+    handler.populateCAS(plainTextView);
+    plainTextView.setDocumentLanguage(lang);
+
+    // get additional metadata about the document
+    // e.g content type etc...
+    // TODO add possibility to define type as parameter and discover
+    // feature names on the fly
+    JCas ptv = null;
+    try {
+      ptv = plainTextView.getJCas();
+    } catch (CASException e) {
+      e.printStackTrace();
+      return;
+    }
+
+    Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.SourceDocumentAnnotation");
+    Iterator<?> iter = ptv.getAnnotationIndex(docAnnotationType).iterator();
+    SourceDocumentAnnotation docAnnotation = null;
+    boolean newlyCreated = false;
+    // do we already have one?
+    if (iter.hasNext()) {
+      docAnnotation = (SourceDocumentAnnotation) iter.next();
+    } else {
+      // otherwise let's create a new annotation
+      docAnnotation = new SourceDocumentAnnotation(ptv);
+      newlyCreated = true;
+    }
+
+    // now iterate on the metadata found by Tika and add them to the info
+    String[] names = md.names();
+    FSArray features = docAnnotation.getFeatures();
+    // allocate when there is no array yet or when the existing one is too
+    // small to hold every metadata entry (the indexed setter would throw)
+    if (features == null || features.size() < names.length) {
+      docAnnotation.setFeatures((FSArray) cas.createArrayFS(names.length));
+    }
+
+    for (int i = 0; i < names.length; i++) {
+      String name = names[i];
+      String value = md.get(name);
+      FeatureValue fv = new FeatureValue(ptv);
+      fv.setName(name);
+      fv.setValue(value);
+      docAnnotation.setFeatures(i, fv);
+    }
+
+    // an annotation fetched from the index is already indexed; adding it a
+    // second time would create a duplicate index entry
+    if (newlyCreated) {
+      docAnnotation.addToIndexes();
+    }
+  }
+
+}
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+
+import org.apache.uima.AttributeFS;
+import org.apache.uima.MarkupAnnotation;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Type;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/*******************************************************************************
+ * SAX Handler which gets events from the Tika parser events and create UIMA
+ * annotations accordingly.
+ *
+ ******************************************************************************/
+
+public class MarkupHandler implements ContentHandler {
+
+ private StringBuffer textBuffer;
+
+ private List<ProtoAnnotation> protoAnnotations;
+
+ private LinkedList<ProtoAnnotation> startedAnnotations;
+
+ public MarkupHandler() {
+ textBuffer = new StringBuffer();
+ protoAnnotations = new LinkedList<ProtoAnnotation>();
+ startedAnnotations = new LinkedList<ProtoAnnotation>();
+ }
+
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ // MS doc spits out funny characters
+ // we replace them with ' '
+ for (int c = start;c<start+length;c++){
+ if (!Character.isISOControl(ch[c])) continue;
+ if (Character.isWhitespace(ch[c])) continue;
+ ch[c] = ' ';
+ }
+
+ // store the characters in the textBuffer
+ textBuffer.append(ch, start, length);
+ }
+
+ public void startDocument() throws SAXException {
+ }
+
+ public void endDocument() throws SAXException {
+ // there should be no annotation left at this stage
+ if (startedAnnotations.size() != 0) {
+ // TODO log + error message
+ }
+ }
+
+ public void startElement(String uri, String localName, String qName,
+ Attributes atts) throws SAXException {
+ int startOffset = textBuffer.length();
+
+ ProtoAnnotation proto = new ProtoAnnotation(uri,localName, qName,atts, startOffset);
+ this.startedAnnotations.addLast(proto);
+ }
+
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ int endOffset = textBuffer.length();
+
+ // try to get the corresponding annotation
+ // we start from the last temporary
+ // and go up the stack
+ Iterator<ProtoAnnotation> iter = startedAnnotations.iterator();
+ ProtoAnnotation startedAnnot = null;
+ while (iter.hasNext()){
+ ProtoAnnotation temp = iter.next();
+ if (temp.getLocalName().equals(localName)){
+ startedAnnot = temp;
+ break;
+ }
+ }
+ // found something?
+ if (startedAnnot==null){
+ // TODO log etc...
+ return;
+ }
+
+ startedAnnot.setEnd(endOffset);
+ startedAnnotations.remove(startedAnnot);
+ protoAnnotations.add(startedAnnot);
+
+ // add a \n otherwise we get everything
+ // on a single line
+ textBuffer.append("\n");
+ }
+
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ }
+
+ // the following methods are simply ignored
+
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ }
+
+ public void endPrefixMapping(String prefix) throws SAXException {
+ }
+
+ public void setDocumentLocator(Locator locator) {
+ }
+
+ public void skippedEntity(String name) throws SAXException {
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ }
+
+ public void populateCAS(CAS cas){
+ // set the text
+ cas.setDocumentText(this.textBuffer.toString());
+
+ Type markupType = cas.getTypeSystem().getType("org.apache.uima.MarkupAnnotation");
+ Type attributeType = cas.getTypeSystem().getType("org.apache.uima.AttributeFS");
+
+ JCas jcas;
+ try {
+ jcas = cas.getJCas();
+ } catch (CASException e) {
+ throw new RuntimeException(e);
+ }
+
+ // now convert the proto annotations into real ones
+ for (ProtoAnnotation proto : protoAnnotations) {
+ MarkupAnnotation markup = new MarkupAnnotation(jcas);
+ markup.setBegin(proto.getStart());
+ markup.setEnd(proto.getEnd());
+ // generate attributes
+ Attributes protoAttributes = proto.getAtts();
+ FSArray attribs = (FSArray) cas.createArrayFS(protoAttributes.getLength());
+ for (int index=0; index< protoAttributes.getLength();index++){
+ org.apache.uima.AttributeFS afs = (AttributeFS) cas.createFS(attributeType);
+ afs.setLocalName(protoAttributes.getLocalName(index));
+ afs.setQualifiedName(protoAttributes.getQName(index));
+ afs.setUri(protoAttributes.getURI(index));
+ afs.setValue(protoAttributes.getValue(index));
+ afs.addToIndexes();
+ attribs.set(index, afs);
+ }
+ markup.setAttributes(attribs);
+ markup.setUri(proto.getUri());
+ markup.setName(proto.getLocalName());
+ markup.setQualifiedName(proto.getQName());
+ markup.addToIndexes();
+ }
+ }
+
+
+}
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import org.xml.sax.Attributes;
+
+/**
+ * Framework-neutral description of a markup element: its namespace URI, local
+ * and qualified names, attributes, and the start and end offsets of the text
+ * it covers. It can be converted into a proper GATE or UIMA annotation later.
+ * <p>
+ * Everything except the end offset is fixed at construction time; the end
+ * offset stays at 0 until {@link #setEnd(int)} is called.
+ */
+public class ProtoAnnotation {
+
+  // identity of the element
+  private final String uri;
+  private final String localName;
+  private final String qName;
+  private final Attributes atts;
+
+  // covered text span
+  private final int start;
+  private int end;
+
+  /**
+   * Creates a proto annotation for an element that has just been opened.
+   *
+   * @param uri the namespace URI of the element
+   * @param localName the local name of the element
+   * @param qName the qualified name of the element
+   * @param atts the attributes of the element; stored as given, not copied
+   * @param start the text offset at which the element starts
+   */
+  public ProtoAnnotation(String uri, String localName, String qName, Attributes atts, int start) {
+    this.start = start;
+    this.atts = atts;
+    this.qName = qName;
+    this.localName = localName;
+    this.uri = uri;
+  }
+
+  /** @return the namespace URI of the element */
+  public String getUri() {
+    return this.uri;
+  }
+
+  /** @return the local name of the element */
+  public String getLocalName() {
+    return this.localName;
+  }
+
+  /** @return the qualified name of the element */
+  public String getQName() {
+    return this.qName;
+  }
+
+  /** @return the attributes passed to the constructor */
+  public Attributes getAtts() {
+    return this.atts;
+  }
+
+  /** @return the text offset at which the element starts */
+  public int getStart() {
+    return this.start;
+  }
+
+  /** @return the text offset at which the element ends */
+  public int getEnd() {
+    return this.end;
+  }
+
+  /** @param end the text offset at which the element ends */
+  public void setEnd(int end) {
+    this.end = end;
+  }
+
+}
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.jcas.cas.TOP_Type;
+
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+
+
+/**
+ * JCas cover class for the CAS type "org.apache.uima.SourceDocumentAnnotation".
+ * It extends DocumentAnnotation with three features: uri (String),
+ * contentType (String) and features (an FSArray whose elements are read and
+ * written as FeatureValue).
+ * Updated by JCasGen Thu Sep 18 08:31:44 BST 2008
+ * XML source: /data/gate-plugins/UIMAAnnotationReader/desc/MarkupAnnotationTypeSystem.xml
+ * @generated */
+public class SourceDocumentAnnotation extends DocumentAnnotation {
+ /** Index assigned to this class by JCasRegistry.register.
+ * @generated
+ * @ordered
+ */
+ public final static int typeIndexID = JCasRegistry.register(SourceDocumentAnnotation.class);
+ /** Same value as typeIndexID.
+ * @generated
+ * @ordered
+ */
+ public final static int type = typeIndexID;
+ /** Returns typeIndexID.
+ * @generated */
+ public int getTypeIndexID() {return typeIndexID;}
+
+ /** Never called. Disable default constructor
+ * @generated */
+ protected SourceDocumentAnnotation() {}
+
+ /** Internal - constructor used by generator
+ * @generated */
+ public SourceDocumentAnnotation(int addr, TOP_Type type) {
+ super(addr, type);
+ readObject();
+ }
+
+ /** Creates a new annotation in the given JCas.
+ * @generated */
+ public SourceDocumentAnnotation(JCas jcas) {
+ super(jcas);
+ readObject();
+ }
+
+ /** Creates a new annotation in the given JCas and sets its begin and end offsets.
+ * @generated */
+ public SourceDocumentAnnotation(JCas jcas, int begin, int end) {
+ super(jcas);
+ setBegin(begin);
+ setEnd(end);
+ readObject();
+ }
+
+ /** <!-- begin-user-doc -->
+ * Write your own initialization here
+ * <!-- end-user-doc -->
+ @generated modifiable */
+ private void readObject() {}
+
+
+
+ //*--------------*
+ //* Feature: uri
+
+ /** getter for uri - gets the string value of the "uri" feature.
+ * Calls throwFeatMissing when featOkTst is set and the feature was not resolved.
+ * @generated */
+ public String getUri() {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_uri == null)
+ jcasType.jcas.throwFeatMissing("uri", "org.apache.uima.SourceDocumentAnnotation");
+ return jcasType.ll_cas.ll_getStringValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_uri);}
+
+ /** setter for uri - sets the string value of the "uri" feature.
+ * @generated */
+ public void setUri(String v) {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_uri == null)
+ jcasType.jcas.throwFeatMissing("uri", "org.apache.uima.SourceDocumentAnnotation");
+ jcasType.ll_cas.ll_setStringValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_uri, v);}
+
+
+ //*--------------*
+ //* Feature: contentType
+
+ /** getter for contentType - gets the string value of the "contentType" feature.
+ * @generated */
+ public String getContentType() {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_contentType == null)
+ jcasType.jcas.throwFeatMissing("contentType", "org.apache.uima.SourceDocumentAnnotation");
+ return jcasType.ll_cas.ll_getStringValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_contentType);}
+
+ /** setter for contentType - sets the string value of the "contentType" feature.
+ * @generated */
+ public void setContentType(String v) {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_contentType == null)
+ jcasType.jcas.throwFeatMissing("contentType", "org.apache.uima.SourceDocumentAnnotation");
+ jcasType.ll_cas.ll_setStringValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_contentType, v);}
+
+
+ //*--------------*
+ //* Feature: features
+
+ /** getter for features - gets the FSArray referenced by the "features" feature.
+ * @generated */
+ public FSArray getFeatures() {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_features == null)
+ jcasType.jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+ return (FSArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_features)));}
+
+ /** setter for features - stores a reference to the given FSArray in the "features" feature.
+ * @generated */
+ public void setFeatures(FSArray v) {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_features == null)
+ jcasType.jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+ jcasType.ll_cas.ll_setRefValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_features, jcasType.ll_cas.ll_getFSRef(v));}
+
+ /** indexed getter for features - gets an indexed value -
+ * the index is passed to checkArrayBounds before element i of the array is
+ * read and returned as a FeatureValue.
+ * @generated */
+ public FeatureValue getFeatures(int i) {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_features == null)
+ jcasType.jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+ jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_features), i);
+ return (FeatureValue)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_features), i)));}
+
+ /** indexed setter for features - sets an indexed value -
+ * the index is passed to checkArrayBounds before element i of the array is
+ * set to a reference to v.
+ * @generated */
+ public void setFeatures(int i, FeatureValue v) {
+ if (SourceDocumentAnnotation_Type.featOkTst && ((SourceDocumentAnnotation_Type)jcasType).casFeat_features == null)
+ jcasType.jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+ jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_features), i);
+ jcasType.ll_cas.ll_setRefArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((SourceDocumentAnnotation_Type)jcasType).casFeatCode_features), i, jcasType.ll_cas.ll_getFSRef(v));}
+ }
+
+
\ No newline at end of file
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.tcas.DocumentAnnotation_Type;
+
+/**
+ * Low-level JCas type class for org.apache.uima.SourceDocumentAnnotation.
+ * The constructor looks up the "uri", "contentType" and "features" features; the
+ * accessors read and write those features by feature-structure address.
+ * Updated by JCasGen Thu Sep 18 08:31:44 BST 2008
+ * @generated */
+public class SourceDocumentAnnotation_Type extends DocumentAnnotation_Type {
+  /** Returns the generator that creates SourceDocumentAnnotation instances.
+   * @generated */
+  protected FSGenerator getFSGenerator() {return fsGenerator;}
+  /** @generated */
+  private final FSGenerator fsGenerator =
+    new FSGenerator() {
+      public FeatureStructure createFS(int addr, CASImpl cas) {
+        if (SourceDocumentAnnotation_Type.this.useExistingInstance) {
+          // Return eq fs instance if already created
+          FeatureStructure fs = SourceDocumentAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
+          if (null == fs) {
+            fs = new SourceDocumentAnnotation(addr, SourceDocumentAnnotation_Type.this);
+            SourceDocumentAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
+          }
+          return fs;
+        }
+        return new SourceDocumentAnnotation(addr, SourceDocumentAnnotation_Type.this);
+      }
+    };
+  /** @generated */
+  public final static int typeIndexID = SourceDocumentAnnotation.typeIndexID;
+  /** Whether accessors report a missing feature before using it; value supplied by JCasRegistry.
+   * @generated
+   * @modifiable */
+  public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.SourceDocumentAnnotation");
+
+  /** @generated */
+  final Feature casFeat_uri;
+  /** @generated */
+  final int casFeatCode_uri;
+  /** Reads the "uri" string feature of the feature structure at {@code addr}.
+   * @generated */
+  public String getUri(int addr) {
+    if (featOkTst && casFeat_uri == null)
+      jcas.throwFeatMissing("uri", "org.apache.uima.SourceDocumentAnnotation");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_uri);
+  }
+  /** Writes the "uri" string feature of the feature structure at {@code addr}.
+   * @generated */
+  public void setUri(int addr, String v) {
+    if (featOkTst && casFeat_uri == null)
+      jcas.throwFeatMissing("uri", "org.apache.uima.SourceDocumentAnnotation");
+    ll_cas.ll_setStringValue(addr, casFeatCode_uri, v);
+  }
+
+
+
+  /** @generated */
+  final Feature casFeat_contentType;
+  /** @generated */
+  final int casFeatCode_contentType;
+  /** Reads the "contentType" string feature of the feature structure at {@code addr}.
+   * @generated */
+  public String getContentType(int addr) {
+    if (featOkTst && casFeat_contentType == null)
+      jcas.throwFeatMissing("contentType", "org.apache.uima.SourceDocumentAnnotation");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_contentType);
+  }
+  /** Writes the "contentType" string feature of the feature structure at {@code addr}.
+   * @generated */
+  public void setContentType(int addr, String v) {
+    if (featOkTst && casFeat_contentType == null)
+      jcas.throwFeatMissing("contentType", "org.apache.uima.SourceDocumentAnnotation");
+    ll_cas.ll_setStringValue(addr, casFeatCode_contentType, v);
+  }
+
+
+
+  /** @generated */
+  final Feature casFeat_features;
+  /** @generated */
+  final int casFeatCode_features;
+  /** Returns the reference held by the "features" feature of the feature structure at {@code addr}.
+   * @generated */
+  public int getFeatures(int addr) {
+    if (featOkTst && casFeat_features == null)
+      jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_features);
+  }
+  /** Stores the reference {@code v} in the "features" feature of the feature structure at {@code addr}.
+   * @generated */
+  public void setFeatures(int addr, int v) {
+    if (featOkTst && casFeat_features == null)
+      jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+    ll_cas.ll_setRefValue(addr, casFeatCode_features, v);
+  }
+
+  /** Returns element {@code i} of the array referenced by the "features" feature.
+   * With lowLevelTypeChecks on, the checked low-level read is used; otherwise the
+   * index is bounds-checked through the JCas before the unchecked read.
+   * @generated */
+  public int getFeatures(int addr, int i) {
+    if (featOkTst && casFeat_features == null)
+      jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+    if (lowLevelTypeChecks)
+      return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_features), i, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_features), i);
+    return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_features), i);
+  }
+
+  /** Stores {@code v} at index {@code i} of the array referenced by the "features" feature.
+   * Mirrors the indexed getter: with lowLevelTypeChecks on, only the checked write is
+   * performed (previously the slot was then written a second time, unchecked).
+   * NOTE(review): this member is tagged as generated; confirm that rerunning JCasGen
+   * does not overwrite this hand edit.
+   * @generated */
+  public void setFeatures(int addr, int i, int v) {
+    if (featOkTst && casFeat_features == null)
+      jcas.throwFeatMissing("features", "org.apache.uima.SourceDocumentAnnotation");
+    if (lowLevelTypeChecks) {
+      ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_features), i, v, true);
+      return;
+    }
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_features), i);
+    ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_features), i, v);
+  }
+
+
+
+
+  /** initialize variables to correspond with Cas Type and Features
+   * @generated */
+  public SourceDocumentAnnotation_Type(JCas jcas, Type casType) {
+    super(jcas, casType);
+    casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+    // Each feature code falls back to JCas.INVALID_FEATURE_CODE when the lookup returned null.
+    casFeat_uri = jcas.getRequiredFeatureDE(casType, "uri", "uima.cas.String", featOkTst);
+    casFeatCode_uri = (null == casFeat_uri) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_uri).getCode();
+
+
+    casFeat_contentType = jcas.getRequiredFeatureDE(casType, "contentType", "uima.cas.String", featOkTst);
+    casFeatCode_contentType = (null == casFeat_contentType) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_contentType).getCode();
+
+
+    casFeat_features = jcas.getRequiredFeatureDE(casType, "features", "uima.cas.FSArray", featOkTst);
+    casFeatCode_features = (null == casFeat_features) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_features).getCode();
+
+  }
+}
+
+
+
+
\ No newline at end of file
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java Wed Jan 7 15:26:46 2009
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.uima.FeatureValue;
+import org.apache.uima.SourceDocumentAnnotation;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+
+/**
+ * Parses the content behind a URL with Tika and copies the result into a CAS:
+ * the text and markup collected by a MarkupHandler, plus a
+ * SourceDocumentAnnotation whose "features" array holds one FeatureValue per
+ * Tika metadata entry and a final entry named "uri".
+ */
+public class TIKAWrapper {
+
+  // configuration for TIKA - can be created by specifying a custom resource
+  private TikaConfig config = null;
+
+  /**
+   * Creates a wrapper that uses Tika's default configuration.
+   *
+   * @throws TikaException declared for the default-configuration lookup
+   */
+  public TIKAWrapper() throws TikaException {
+    config = TikaConfig.getDefaultConfig();
+  }
+
+  /**
+   * Creates a wrapper configured from {@code configLocation}. When the location is
+   * {@code null} or the configuration cannot be loaded from it, Tika's default
+   * configuration is used instead.
+   *
+   * @param configLocation location of a custom Tika configuration, may be {@code null}
+   * @throws TikaException declared for the default-configuration lookup
+   */
+  public TIKAWrapper(String configLocation) throws TikaException {
+    if (configLocation != null) {
+      try {
+        config = new TikaConfig(configLocation);
+      } catch (Exception ignored) {
+        // Deliberate best effort: an unusable custom configuration is not fatal,
+        // the default configuration below takes over.
+      }
+    }
+    if (config == null) {
+      config = TikaConfig.getDefaultConfig();
+    }
+  }
+
+
+  /**
+   * Populates {@code cas} from {@code url}, letting Tika auto-detect the parser.
+   *
+   * @param cas the CAS to fill
+   * @param url the document to read
+   * @param language document language to set on the CAS, or {@code null} to leave it unset
+   * @throws CASException if the URL cannot be opened or the JCas cannot be obtained
+   */
+  public void populateCASfromURL(CAS cas, URL url, String language) throws CASException {
+    populateCASfromURL(cas, url, null, language);
+  }
+
+  /**
+   * Populates {@code cas} from {@code url}. If parsing fails, the document text is
+   * set to the empty string and the method returns without adding any annotation.
+   *
+   * @param cas the CAS to fill
+   * @param url the document to read
+   * @param mime MIME type used to pick a parser; {@code null} or empty means auto-detect
+   * @param language document language to set on the CAS, or {@code null} to leave it unset
+   * @throws CASException if the URL cannot be opened or the JCas cannot be obtained
+   */
+  public void populateCASfromURL(CAS cas, URL url, String mime, String language) throws CASException {
+
+    // use custom parser or rely on autodetect
+    Parser parser = null;
+    if (mime != null && !mime.equals("")) {
+      parser = config.getParser(mime);
+    }
+    // if that does not work
+    if (parser == null) {
+      parser = new AutoDetectParser(config);
+    }
+
+    Metadata md = new Metadata();
+    MarkupHandler handler = new MarkupHandler();
+
+    // Open the stream last so that nothing can fail between opening it and the
+    // try/finally that closes it.
+    final InputStream originalStream;
+    try {
+      originalStream = new BufferedInputStream(url.openStream());
+    } catch (IOException e) {
+      // Previously the exception was created but never thrown, which led to a
+      // NullPointerException when the null stream was closed later on.
+      throw new CASException(e);
+    }
+
+    try {
+      parser.parse(originalStream, handler, md);
+    } catch (Exception e) {
+      // Best effort: a document that cannot be converted yields an empty text.
+      // The failure is currently not logged.
+      cas.setDocumentText("");
+      return;
+    } finally {
+      // set language if it was explicitly specified as a configuration
+      // parameter
+      if (language != null) {
+        cas.setDocumentLanguage(language);
+      }
+      try {
+        originalStream.close();
+      } catch (IOException ignored) {
+        // Nothing useful can be done if closing the stream fails.
+      }
+    }
+
+    // add text and markup to CAS
+    handler.populateCAS(cas);
+
+    JCas jcas = cas.getJCas();
+
+    SourceDocumentAnnotation docAnnotation = new SourceDocumentAnnotation(jcas);
+
+    // now iterate on the metadata found by Tika and add them to the info;
+    // the names are read once so the array size and the loop cannot disagree
+    final String[] names = md.names();
+    if (docAnnotation.getFeatures() == null) {
+      // one slot per metadata entry plus one for the trailing "uri" entry
+      docAnnotation.setFeatures((FSArray) cas.createArrayFS(names.length + 1));
+    }
+    int i = 0;
+    for (; i < names.length; i++) {
+      String name = names[i];
+      FeatureValue fv = new FeatureValue(jcas);
+      fv.setName(name);
+      fv.setValue(md.get(name));
+      docAnnotation.setFeatures(i, fv);
+    }
+
+    FeatureValue fv = new FeatureValue(jcas);
+    fv.setName("uri");
+    fv.setValue(url.toString());
+    docAnnotation.setFeatures(i, fv);
+
+    docAnnotation.addToIndexes();
+  }
+}
\ No newline at end of file
Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
------------------------------------------------------------------------------
svn:mime-type = text/plain