You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2009/01/08 00:26:47 UTC

svn commit: r732555 [2/2] - in /incubator/uima/sandbox/trunk/TikaAnnotator: ./ desc/ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/uima/ src/main/java/org/apache/uima/tika/

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.tcas.Annotation_Type;
+
+/** 
+ * Updated by JCasGen Thu Sep 18 08:31:44 BST 2008
+ * @generated */
+public class MarkupAnnotation_Type extends Annotation_Type {
+  /** @generated */
+  protected FSGenerator getFSGenerator() {return fsGenerator;}
+  /** @generated */
+  private final FSGenerator fsGenerator = 
+    new FSGenerator() {
+      public FeatureStructure createFS(int addr, CASImpl cas) {
+  			 if (MarkupAnnotation_Type.this.useExistingInstance) {
+  			   // Return eq fs instance if already created
+  		     FeatureStructure fs = MarkupAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
+  		     if (null == fs) {
+  		       fs = new MarkupAnnotation(addr, MarkupAnnotation_Type.this);
+  			   MarkupAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
+  			   return fs;
+  		     }
+  		     return fs;
+        } else return new MarkupAnnotation(addr, MarkupAnnotation_Type.this);
+  	  }
+    };
+  /** @generated */
+  public final static int typeIndexID = MarkupAnnotation.typeIndexID;
+  /** @generated 
+     @modifiable */
+  public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.MarkupAnnotation");
+ 
+  /** @generated */
+  final Feature casFeat_attributes;
+  /** @generated */
+  final int     casFeatCode_attributes;
+  /** @generated */ 
+  public int getAttributes(int addr) {
+        if (featOkTst && casFeat_attributes == null)
+      jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_attributes);
+  }
+  /** @generated */    
+  public void setAttributes(int addr, int v) {
+        if (featOkTst && casFeat_attributes == null)
+      jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+    ll_cas.ll_setRefValue(addr, casFeatCode_attributes, v);}
+    
+   /** @generated */
+  public int getAttributes(int addr, int i) {
+        if (featOkTst && casFeat_attributes == null)
+      jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+    if (lowLevelTypeChecks)
+      return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i);
+	return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i);
+  }
+   
+  /** @generated */ 
+  public void setAttributes(int addr, int i, int v) {
+        if (featOkTst && casFeat_attributes == null)
+      jcas.throwFeatMissing("attributes", "org.apache.uima.MarkupAnnotation");
+    if (lowLevelTypeChecks)
+      ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i, v, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i);
+    ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_attributes), i, v);
+  }
+ 
+ 
+  /** @generated */
+  final Feature casFeat_children;
+  /** @generated */
+  final int     casFeatCode_children;
+  /** @generated */ 
+  public int getChildren(int addr) {
+        if (featOkTst && casFeat_children == null)
+      jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_children);
+  }
+  /** @generated */    
+  public void setChildren(int addr, int v) {
+        if (featOkTst && casFeat_children == null)
+      jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+    ll_cas.ll_setRefValue(addr, casFeatCode_children, v);}
+    
+   /** @generated */
+  public int getChildren(int addr, int i) {
+        if (featOkTst && casFeat_children == null)
+      jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+    if (lowLevelTypeChecks)
+      return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_children), i);
+	return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i);
+  }
+   
+  /** @generated */ 
+  public void setChildren(int addr, int i, int v) {
+        if (featOkTst && casFeat_children == null)
+      jcas.throwFeatMissing("children", "org.apache.uima.MarkupAnnotation");
+    if (lowLevelTypeChecks)
+      ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i, v, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_children), i);
+    ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_children), i, v);
+  }
+ 
+ 
+  /** @generated */
+  final Feature casFeat_name;
+  /** @generated */
+  final int     casFeatCode_name;
+  /** @generated */ 
+  public String getName(int addr) {
+        if (featOkTst && casFeat_name == null)
+      jcas.throwFeatMissing("name", "org.apache.uima.MarkupAnnotation");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_name);
+  }
+  /** @generated */    
+  public void setName(int addr, String v) {
+        if (featOkTst && casFeat_name == null)
+      jcas.throwFeatMissing("name", "org.apache.uima.MarkupAnnotation");
+    ll_cas.ll_setStringValue(addr, casFeatCode_name, v);}
+    
+  
+ 
+  /** @generated */
+  final Feature casFeat_parent;
+  /** @generated */
+  final int     casFeatCode_parent;
+  /** @generated */ 
+  public int getParent(int addr) {
+        if (featOkTst && casFeat_parent == null)
+      jcas.throwFeatMissing("parent", "org.apache.uima.MarkupAnnotation");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_parent);
+  }
+  /** @generated */    
+  public void setParent(int addr, int v) {
+        if (featOkTst && casFeat_parent == null)
+      jcas.throwFeatMissing("parent", "org.apache.uima.MarkupAnnotation");
+    ll_cas.ll_setRefValue(addr, casFeatCode_parent, v);}
+    
+  
+ 
+  /** @generated */
+  final Feature casFeat_qualifiedName;
+  /** @generated */
+  final int     casFeatCode_qualifiedName;
+  /** @generated */ 
+  public String getQualifiedName(int addr) {
+        if (featOkTst && casFeat_qualifiedName == null)
+      jcas.throwFeatMissing("qualifiedName", "org.apache.uima.MarkupAnnotation");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_qualifiedName);
+  }
+  /** @generated */    
+  public void setQualifiedName(int addr, String v) {
+        if (featOkTst && casFeat_qualifiedName == null)
+      jcas.throwFeatMissing("qualifiedName", "org.apache.uima.MarkupAnnotation");
+    ll_cas.ll_setStringValue(addr, casFeatCode_qualifiedName, v);}
+    
+  
+ 
+  /** @generated */
+  final Feature casFeat_uri;
+  /** @generated */
+  final int     casFeatCode_uri;
+  /** @generated */ 
+  public String getUri(int addr) {
+        if (featOkTst && casFeat_uri == null)
+      jcas.throwFeatMissing("uri", "org.apache.uima.MarkupAnnotation");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_uri);
+  }
+  /** @generated */    
+  public void setUri(int addr, String v) {
+        if (featOkTst && casFeat_uri == null)
+      jcas.throwFeatMissing("uri", "org.apache.uima.MarkupAnnotation");
+    ll_cas.ll_setStringValue(addr, casFeatCode_uri, v);}
+    
+  
+
+
+
+  /** initialize variables to correspond with Cas Type and Features
+	* @generated */
+  public MarkupAnnotation_Type(JCas jcas, Type casType) {
+    super(jcas, casType);
+    casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+ 
+    casFeat_attributes = jcas.getRequiredFeatureDE(casType, "attributes", "uima.cas.FSArray", featOkTst);
+    casFeatCode_attributes  = (null == casFeat_attributes) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_attributes).getCode();
+
+ 
+    casFeat_children = jcas.getRequiredFeatureDE(casType, "children", "uima.cas.FSArray", featOkTst);
+    casFeatCode_children  = (null == casFeat_children) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_children).getCode();
+
+ 
+    casFeat_name = jcas.getRequiredFeatureDE(casType, "name", "uima.cas.String", featOkTst);
+    casFeatCode_name  = (null == casFeat_name) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_name).getCode();
+
+ 
+    casFeat_parent = jcas.getRequiredFeatureDE(casType, "parent", "org.apache.uima.MarkupAnnotation", featOkTst);
+    casFeatCode_parent  = (null == casFeat_parent) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_parent).getCode();
+
+ 
+    casFeat_qualifiedName = jcas.getRequiredFeatureDE(casType, "qualifiedName", "uima.cas.String", featOkTst);
+    casFeatCode_qualifiedName  = (null == casFeat_qualifiedName) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_qualifiedName).getCode();
+
+ 
+    casFeat_uri = jcas.getRequiredFeatureDE(casType, "uri", "uima.cas.String", featOkTst);
+    casFeatCode_uri  = (null == casFeat_uri) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_uri).getCode();
+
+  }
+}
+
+
+
+    
\ No newline at end of file

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotation_Type.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Iterator;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.uima.FeatureValue;
+import org.apache.uima.SourceDocumentAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Type;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+
+
+/** Uses TIKA to convert original markup into UIMA annotations**/
+public class MarkupAnnotator extends CasAnnotator_ImplBase {
+
+
+	private final static String ORIGINAL_VIEW_PARAM_NAME = "ORIGINAL_VIEW_PARAM_NAME";
+	private final static String TEXT_VIEW_PARAM_NAME = "TEXT_VIEW_PARAM_NAME";
+	private final static String SET_TEXT_VIEW_DEFAULT_PARAM_NAME = "SET_TEXT_VIEW_DEFAULT_PARAM_NAME";
+	
+	private final static String tika_file_param = "tikaConfigFile";
+	
+	// takes an option indicating the name of the view containing the binary document
+	private String originalViewName = "_InitialView";
+	
+	// takes an option indicating the name of the view containing the text version of the document
+	private String textViewName = "textView";
+	
+	// whether to make the text view default or not
+	private Boolean makeTextDefaultView = true;
+	
+	// configuration for TIKA - can be created by specifying a custom resource
+	private TikaConfig config = null;
+	
+	public void initialize(UimaContext aContext) throws ResourceInitializationException {
+		super.initialize(aContext);
+		// Get config param setting
+		originalViewName  = (String) aContext.getConfigParameterValue(ORIGINAL_VIEW_PARAM_NAME);
+
+		textViewName = (String) aContext.getConfigParameterValue(TEXT_VIEW_PARAM_NAME);
+		if (textViewName==null) {
+			System.err.println("Parameter TEXT_VIEW_PARAM_NAME is null; setting to \"textView\"");
+			textViewName = "textView";
+		}
+		else System.err.println("Parameter TEXT_VIEW_PARAM_NAME is "+textViewName);
+		
+		makeTextDefaultView = (Boolean) aContext.getConfigParameterValue(SET_TEXT_VIEW_DEFAULT_PARAM_NAME);
+		if (makeTextDefaultView==null) {
+			System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is null; setting to \"true\"");
+			makeTextDefaultView = new Boolean(true);
+		}
+		else System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is "+makeTextDefaultView);
+		
+		// initialise TIKA parser
+		// try to get a custom config
+		URL tikaConfigURL = null;
+		try {
+			tikaConfigURL = getContext().getResourceURL(tika_file_param);
+			config = new TikaConfig(tikaConfigURL);
+		} catch (Exception e1) {
+			// to log
+			System.err.println("Failed to load TIKA config file from "+tikaConfigURL);
+			config = null;
+		}
+
+		// if not rely on default one
+		if (config==null){
+			try {
+				config = TikaConfig.getDefaultConfig();
+			} catch (TikaException e) {
+				throw new ResourceInitializationException(e);
+			}
+		}
+		
+	}
+	
+	public void process(CAS cas) throws AnalysisEngineProcessException {
+	    CAS originalCas = null;
+	    try {
+	    originalCas = cas.getView(originalViewName);
+	    }
+	    catch (Exception e){
+	    	String viewName = cas.getViewName();
+	    	// can't find originalViewName
+	    	System.err.println("can't find view "+originalViewName+" using "+viewName+" instead");
+	    	originalCas = cas.getCurrentView();
+	    }
+	    
+	    InputStream originalStream = originalCas.getSofa().getSofaDataStream();
+		
+	    String lang = null;
+	    
+	    // parsing with TIKA
+	    
+	    // TODO if content type is known then we use it 
+	    // otherwise we guess
+	    
+	    Parser parser = new AutoDetectParser(config);
+
+	    Metadata md = new Metadata();
+	    MarkupHandler handler  = new MarkupHandler();		  
+
+	    try {
+	    	parser.parse(originalStream,handler , md);
+	    }
+	    catch (Exception e){
+	    	// if we have a problem just dump the message and continue
+	    	System.err.println("Problem converting file : "+e.getMessage());
+	    	// PROBLEM => trying to serialize binary content in XML crash!
+	    	return;
+	    }
+	    finally {
+	    	try {
+				originalStream.close();
+			} catch (IOException e) {
+			}
+	    }
+	    
+	    CAS plainTextView = cas.createView(textViewName);
+	    
+
+	    handler.populateCAS(plainTextView);
+	    plainTextView.setDocumentLanguage(lang);
+	    
+	    // get additional metadata about the document
+	    // e.g content type etc...
+	    // TODO add possibility to define type as parameter and discover
+	    // feature names on the fly
+	    JCas ptv=null;
+		try {
+			ptv = plainTextView.getJCas();
+		} catch (CASException e) {
+			e.printStackTrace();
+			return;
+		}
+	    
+	    Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.SourceDocumentAnnotation");
+	    Iterator iter = ptv.getAnnotationIndex(docAnnotationType).iterator();
+	    SourceDocumentAnnotation docAnnotation = null;
+	    // do we already have one?
+	    if (iter.hasNext()) docAnnotation = (SourceDocumentAnnotation) iter.next();
+	    // otherwise let's create a new annotation
+	    else docAnnotation = new SourceDocumentAnnotation(ptv);
+	    
+	    // now iterate on the metadata found by Tika and add them to the info
+	    if (docAnnotation.getFeatures()==null)
+	    	docAnnotation.setFeatures((FSArray) cas.createArrayFS(md.size())) ;
+	    
+	    for (int i=0;i<md.size();i++){
+	    	String name = md.names()[i];
+	    	String value = md.get(name);
+	    	FeatureValue fv = new FeatureValue(ptv);
+	    	fv.setName(name);
+	    	fv.setValue(value);
+	    	docAnnotation.setFeatures(i,fv);
+	    }
+	    docAnnotation.addToIndexes();
+	   
+	}
+
+}

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.uima.AttributeFS;
+import org.apache.uima.MarkupAnnotation;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Type;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+/*******************************************************************************
+ * SAX handler which receives events from the Tika parser and creates UIMA
+ * annotations accordingly.
+ * 
+ ******************************************************************************/
+
+public class MarkupHandler implements ContentHandler {
+
+	private StringBuffer textBuffer;
+
+	private List<ProtoAnnotation> protoAnnotations;
+
+	private LinkedList<ProtoAnnotation> startedAnnotations;
+
+	public MarkupHandler() {
+		textBuffer = new StringBuffer();
+		protoAnnotations = new LinkedList<ProtoAnnotation>();
+		startedAnnotations = new LinkedList<ProtoAnnotation>();
+	}
+
+	public void characters(char[] ch, int start, int length)
+			throws SAXException {
+		// MS doc spits out funny characters
+		// we replace them with ' ' 
+		for (int c = start;c<start+length;c++){
+			if (!Character.isISOControl(ch[c])) continue;
+			if (Character.isWhitespace(ch[c])) continue;	
+			ch[c] = ' ';
+		}
+		
+		// store the characters in the textBuffer
+		textBuffer.append(ch, start, length);
+	}
+
+	public void startDocument() throws SAXException {
+	}
+
+	public void endDocument() throws SAXException {
+		// there should be no annotation left at this stage
+		if (startedAnnotations.size() != 0) {
+			// TODO log + error message
+		}
+	}
+
+	/**
+	 * Opens a proto annotation for the element that is starting. Its start
+	 * offset is the current length of the text buffer, and it is placed at the
+	 * end of the list of started annotations until the element is closed.
+	 */
+	public void startElement(String uri, String localName, String qName,
+			Attributes atts) throws SAXException {
+		// the element begins wherever the collected text currently ends
+		startedAnnotations.addLast(
+				new ProtoAnnotation(uri, localName, qName, atts, textBuffer.length()));
+	}
+
+	/**
+	 * Closes the most recently started annotation whose local name matches the
+	 * ending element, records the current text length as its end offset and
+	 * moves it to the list of completed proto annotations. A newline is then
+	 * appended to the text so that elements do not all end up on one line.
+	 * <p>
+	 * If no started annotation matches, the event is ignored and no newline is
+	 * appended.
+	 *
+	 * @param uri       namespace URI of the element (not used for matching)
+	 * @param localName local name used to find the matching start
+	 * @param qName     qualified name of the element (not used for matching)
+	 * @throws SAXException never thrown here; declared by the interface
+	 */
+	public void endElement(String uri, String localName, String qName)
+			throws SAXException {
+		int endOffset = textBuffer.length();
+
+		// Walk from the most recently opened element towards the oldest one so
+		// that nested elements sharing a name are paired with the innermost
+		// start; iterating from the head paired them with the outermost one.
+		java.util.ListIterator<ProtoAnnotation> open =
+				startedAnnotations.listIterator(startedAnnotations.size());
+		ProtoAnnotation startedAnnot = null;
+		while (open.hasPrevious()) {
+			ProtoAnnotation candidate = open.previous();
+			String candidateName = candidate.getLocalName();
+			boolean sameName = (candidateName == null)
+					? localName == null
+					: candidateName.equals(localName);
+			if (sameName) {
+				startedAnnot = candidate;
+				// removes exactly the matched entry
+				open.remove();
+				break;
+			}
+		}
+
+		// found something?
+		if (startedAnnot == null) {
+			// TODO log etc...
+			return;
+		}
+
+		startedAnnot.setEnd(endOffset);
+		protoAnnotations.add(startedAnnot);
+
+		// add a \n otherwise we get everything
+		// on a single line
+		textBuffer.append("\n");
+	}
+
+	public void ignorableWhitespace(char[] ch, int start, int length)
+			throws SAXException {
+	}
+
+	// the following methods are simply ignored
+
+	public void startPrefixMapping(String prefix, String uri)
+			throws SAXException {
+	}
+
+	public void endPrefixMapping(String prefix) throws SAXException {
+	}
+
+	public void setDocumentLocator(Locator locator) {
+	}
+
+	public void skippedEntity(String name) throws SAXException {
+	}
+
+	public void processingInstruction(String target, String data)
+			throws SAXException {
+	}
+
+	public void populateCAS(CAS cas){
+		// set the text 
+		cas.setDocumentText(this.textBuffer.toString());
+		
+		Type markupType = cas.getTypeSystem().getType("org.apache.uima.MarkupAnnotation");
+		Type attributeType = cas.getTypeSystem().getType("org.apache.uima.AttributeFS");
+		
+		JCas jcas;
+		try {
+			jcas = cas.getJCas();
+		} catch (CASException e) {
+			throw new RuntimeException(e);
+		}
+		
+		// now convert the proto annotations into real ones
+		for (ProtoAnnotation proto : protoAnnotations) {
+			MarkupAnnotation markup = new MarkupAnnotation(jcas);
+			markup.setBegin(proto.getStart());
+			markup.setEnd(proto.getEnd());
+			// generate attributes
+			Attributes protoAttributes = proto.getAtts();
+			FSArray attribs = (FSArray) cas.createArrayFS(protoAttributes.getLength());
+			for (int index=0; index< protoAttributes.getLength();index++){
+				org.apache.uima.AttributeFS afs = (AttributeFS) cas.createFS(attributeType);
+				afs.setLocalName(protoAttributes.getLocalName(index));
+				afs.setQualifiedName(protoAttributes.getQName(index));
+				afs.setUri(protoAttributes.getURI(index));
+				afs.setValue(protoAttributes.getValue(index));
+				afs.addToIndexes();
+				attribs.set(index, afs);
+			}
+			markup.setAttributes(attribs);
+			markup.setUri(proto.getUri());
+			markup.setName(proto.getLocalName());
+			markup.setQualifiedName(proto.getQName());
+			markup.addToIndexes();
+		}
+	}
+	
+
+}

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupHandler.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import org.xml.sax.Attributes;
+
+/** 
+ * Neutral representation of an annotation which can be converted into a proper GATE or UIMA annotation later
+ ***/
+
+public class ProtoAnnotation {
+
+	private final String uri;
+	private final String localName;
+	private final String qName;
+	// private snapshot of the element's attributes, never null
+	private final Attributes atts;
+	private final int start;
+	// end offset; stays 0 until setEnd is called
+	private int end;
+
+	/**
+	 * Creates a proto annotation for an element that has just been opened.
+	 * <p>
+	 * The attributes are copied: SAX only guarantees the Attributes object
+	 * handed to startElement to be valid during that call, so keeping the
+	 * reference could expose data that the parser overwrites later. A null
+	 * argument is stored as an empty attribute list.
+	 *
+	 * @param uri       namespace URI of the element
+	 * @param localName local name of the element
+	 * @param qName     qualified name of the element
+	 * @param atts      attributes of the element, may be null
+	 * @param start     offset in the text at which the element starts
+	 */
+	public ProtoAnnotation(String uri, String localName, String qName, Attributes atts, int start) {
+		super();
+		this.uri = uri;
+		this.localName = localName;
+		this.qName = qName;
+		this.atts = (atts == null)
+				? new org.xml.sax.helpers.AttributesImpl()
+				: new org.xml.sax.helpers.AttributesImpl(atts);
+		this.start = start;
+	}
+
+	/** @return the end offset, or 0 if it has not been set yet */
+	public int getEnd() {
+		return end;
+	}
+
+	/** @param end offset in the text at which the element ends */
+	public void setEnd(int end) {
+		this.end = end;
+	}
+
+	/** @return the copy of the element's attributes taken at construction time */
+	public Attributes getAtts() {
+		return atts;
+	}
+
+	/** @return the local name of the element */
+	public String getLocalName() {
+		return localName;
+	}
+
+	/** @return the qualified name of the element */
+	public String getQName() {
+		return qName;
+	}
+
+	/** @return the offset in the text at which the element starts */
+	public int getStart() {
+		return start;
+	}
+
+	/** @return the namespace URI of the element */
+	public String getUri() {
+		return uri;
+	}
+
+}

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/ProtoAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima;
+
+import org.apache.uima.jcas.JCas; 
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.jcas.cas.TOP_Type;
+
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+
+
+/**
+ * JCas cover class for the type {@code org.apache.uima.SourceDocumentAnnotation}.
+ * It extends {@link DocumentAnnotation} with two string features ({@code uri} and
+ * {@code contentType}) and one {@link FSArray} feature ({@code features}) whose
+ * elements are read and written as {@link FeatureValue} instances.
+ *
+ * Updated by JCasGen Thu Sep 18 08:31:44 BST 2008
+ * XML source: /data/gate-plugins/UIMAAnnotationReader/desc/MarkupAnnotationTypeSystem.xml
+ * @generated */
+public class SourceDocumentAnnotation extends DocumentAnnotation {
+  /** Index assigned to this cover class by the JCas registry.
+   * @generated
+   * @ordered
+   */
+  public final static int typeIndexID = JCasRegistry.register(SourceDocumentAnnotation.class);
+  /** Alias of {@link #typeIndexID}.
+   * @generated
+   * @ordered
+   */
+  public final static int type = typeIndexID;
+  /** @generated  */
+  public int getTypeIndexID() {
+    return typeIndexID;
+  }
+
+  /** Never called.  Disable default constructor
+   * @generated */
+  protected SourceDocumentAnnotation() {}
+
+  /** Internal - constructor used by generator
+   * @generated */
+  public SourceDocumentAnnotation(int addr, TOP_Type type) {
+    super(addr, type);
+    readObject();
+  }
+
+  /** Creates a new annotation in the given JCas.
+   * @generated */
+  public SourceDocumentAnnotation(JCas jcas) {
+    super(jcas);
+    readObject();
+  }
+
+  /** Creates a new annotation in the given JCas and sets its begin and end offsets.
+   * @generated */
+  public SourceDocumentAnnotation(JCas jcas, int begin, int end) {
+    super(jcas);
+    setBegin(begin);
+    setEnd(end);
+    readObject();
+  }
+
+  /** <!-- begin-user-doc -->
+   * Write your own initialization here
+   * <!-- end-user-doc -->
+   * @generated modifiable */
+  private void readObject() {}
+
+  /** Returns this instance's type-support object, cast to its concrete class. */
+  private SourceDocumentAnnotation_Type sdaType() {
+    return (SourceDocumentAnnotation_Type) jcasType;
+  }
+
+  /**
+   * Reports a missing feature through the JCas when feature checking is enabled
+   * and the given CAS feature handle is null.
+   */
+  private void sdaRequireFeature(Object casFeat, String featName) {
+    if (SourceDocumentAnnotation_Type.featOkTst && casFeat == null) {
+      jcasType.jcas.throwFeatMissing(featName, "org.apache.uima.SourceDocumentAnnotation");
+    }
+  }
+
+  /** Reads the low-level reference stored in the {@code features} feature. */
+  private int sdaFeaturesRef(SourceDocumentAnnotation_Type t) {
+    return jcasType.ll_cas.ll_getRefValue(addr, t.casFeatCode_features);
+  }
+
+  //*--------------*
+  //* Feature: uri
+
+  /** getter for uri - gets the string value of the uri feature
+   * @generated */
+  public String getUri() {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_uri, "uri");
+    return jcasType.ll_cas.ll_getStringValue(addr, t.casFeatCode_uri);
+  }
+
+  /** setter for uri - sets the string value of the uri feature
+   * @generated */
+  public void setUri(String v) {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_uri, "uri");
+    jcasType.ll_cas.ll_setStringValue(addr, t.casFeatCode_uri, v);
+  }
+
+  //*--------------*
+  //* Feature: contentType
+
+  /** getter for contentType - gets the string value of the contentType feature
+   * @generated */
+  public String getContentType() {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_contentType, "contentType");
+    return jcasType.ll_cas.ll_getStringValue(addr, t.casFeatCode_contentType);
+  }
+
+  /** setter for contentType - sets the string value of the contentType feature
+   * @generated */
+  public void setContentType(String v) {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_contentType, "contentType");
+    jcasType.ll_cas.ll_setStringValue(addr, t.casFeatCode_contentType, v);
+  }
+
+  //*--------------*
+  //* Feature: features
+
+  /** getter for features - gets the whole array
+   * @generated */
+  public FSArray getFeatures() {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_features, "features");
+    return (FSArray) jcasType.ll_cas.ll_getFSForRef(sdaFeaturesRef(t));
+  }
+
+  /** setter for features - replaces the whole array
+   * @generated */
+  public void setFeatures(FSArray v) {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_features, "features");
+    jcasType.ll_cas.ll_setRefValue(addr, t.casFeatCode_features, jcasType.ll_cas.ll_getFSRef(v));
+  }
+
+  /** indexed getter for features - gets the element at index i after a bounds check
+   * @generated */
+  public FeatureValue getFeatures(int i) {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_features, "features");
+    final int arrayRef = sdaFeaturesRef(t);
+    jcasType.jcas.checkArrayBounds(arrayRef, i);
+    final int elementRef = jcasType.ll_cas.ll_getRefArrayValue(arrayRef, i);
+    return (FeatureValue) jcasType.ll_cas.ll_getFSForRef(elementRef);
+  }
+
+  /** indexed setter for features - sets the element at index i after a bounds check
+   * @generated */
+  public void setFeatures(int i, FeatureValue v) {
+    final SourceDocumentAnnotation_Type t = sdaType();
+    sdaRequireFeature(t.casFeat_features, "features");
+    final int arrayRef = sdaFeaturesRef(t);
+    jcasType.jcas.checkArrayBounds(arrayRef, i);
+    jcasType.ll_cas.ll_setRefArrayValue(arrayRef, i, jcasType.ll_cas.ll_getFSRef(v));
+  }
+}
+
+    
\ No newline at end of file

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.tcas.DocumentAnnotation_Type;
+
+/**
+ * JCas type-support class for {@code org.apache.uima.SourceDocumentAnnotation}.
+ * It registers the generator that creates {@link SourceDocumentAnnotation}
+ * instances and offers address-based access to the {@code uri},
+ * {@code contentType} and {@code features} features.
+ *
+ * Updated by JCasGen Thu Sep 18 08:31:44 BST 2008
+ * @generated */
+public class SourceDocumentAnnotation_Type extends DocumentAnnotation_Type {
+  /** @generated */
+  protected FSGenerator getFSGenerator() {
+    return fsGenerator;
+  }
+
+  /** Creates cover-class instances, reusing an existing one when the type is configured to.
+   * @generated */
+  private final FSGenerator fsGenerator = new FSGenerator() {
+    public FeatureStructure createFS(int addr, CASImpl cas) {
+      final SourceDocumentAnnotation_Type owner = SourceDocumentAnnotation_Type.this;
+      if (!owner.useExistingInstance) {
+        return new SourceDocumentAnnotation(addr, owner);
+      }
+      // Return eq fs instance if already created
+      final FeatureStructure cached = owner.jcas.getJfsFromCaddr(addr);
+      if (null != cached) {
+        return cached;
+      }
+      final FeatureStructure created = new SourceDocumentAnnotation(addr, owner);
+      owner.jcas.putJfsFromCaddr(addr, created);
+      return created;
+    }
+  };
+
+  /** @generated */
+  public final static int typeIndexID = SourceDocumentAnnotation.typeIndexID;
+  /** @generated
+     @modifiable */
+  public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.SourceDocumentAnnotation");
+
+  /**
+   * Reports a missing feature through the JCas when feature checking is enabled
+   * and the given CAS feature handle is null.
+   */
+  private void sdaRequireFeature(Feature casFeat, String featName) {
+    if (featOkTst && casFeat == null) {
+      jcas.throwFeatMissing(featName, "org.apache.uima.SourceDocumentAnnotation");
+    }
+  }
+
+  /** Maps a possibly-null feature to its code, or to the invalid-code marker when null. */
+  private static int sdaCodeOf(Feature casFeat) {
+    if (null == casFeat) {
+      return JCas.INVALID_FEATURE_CODE;
+    }
+    return ((FeatureImpl) casFeat).getCode();
+  }
+
+  /** @generated */
+  final Feature casFeat_uri;
+  /** @generated */
+  final int     casFeatCode_uri;
+  /** Reads the uri string of the feature structure at addr.
+   * @generated */
+  public String getUri(int addr) {
+    sdaRequireFeature(casFeat_uri, "uri");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_uri);
+  }
+  /** Writes the uri string of the feature structure at addr.
+   * @generated */
+  public void setUri(int addr, String v) {
+    sdaRequireFeature(casFeat_uri, "uri");
+    ll_cas.ll_setStringValue(addr, casFeatCode_uri, v);
+  }
+
+  /** @generated */
+  final Feature casFeat_contentType;
+  /** @generated */
+  final int     casFeatCode_contentType;
+  /** Reads the contentType string of the feature structure at addr.
+   * @generated */
+  public String getContentType(int addr) {
+    sdaRequireFeature(casFeat_contentType, "contentType");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_contentType);
+  }
+  /** Writes the contentType string of the feature structure at addr.
+   * @generated */
+  public void setContentType(int addr, String v) {
+    sdaRequireFeature(casFeat_contentType, "contentType");
+    ll_cas.ll_setStringValue(addr, casFeatCode_contentType, v);
+  }
+
+  /** @generated */
+  final Feature casFeat_features;
+  /** @generated */
+  final int     casFeatCode_features;
+  /** Reads the low-level reference to the features array of the feature structure at addr.
+   * @generated */
+  public int getFeatures(int addr) {
+    sdaRequireFeature(casFeat_features, "features");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_features);
+  }
+  /** Writes the low-level reference to the features array of the feature structure at addr.
+   * @generated */
+  public void setFeatures(int addr, int v) {
+    sdaRequireFeature(casFeat_features, "features");
+    ll_cas.ll_setRefValue(addr, casFeatCode_features, v);
+  }
+
+  /** Reads element i of the features array of the feature structure at addr.
+   * @generated */
+  public int getFeatures(int addr, int i) {
+    sdaRequireFeature(casFeat_features, "features");
+    final int arrayRef = ll_cas.ll_getRefValue(addr, casFeatCode_features);
+    if (lowLevelTypeChecks) {
+      return ll_cas.ll_getRefArrayValue(arrayRef, i, true);
+    }
+    jcas.checkArrayBounds(arrayRef, i);
+    return ll_cas.ll_getRefArrayValue(arrayRef, i);
+  }
+
+  /** Writes element i of the features array of the feature structure at addr.
+   * @generated */
+  public void setFeatures(int addr, int i, int v) {
+    sdaRequireFeature(casFeat_features, "features");
+    final int arrayRef = ll_cas.ll_getRefValue(addr, casFeatCode_features);
+    // NOTE(review): unlike the indexed getter there is no early return here, so with
+    // lowLevelTypeChecks enabled the element is written twice (checked, then unchecked).
+    // Kept as the generator emitted it.
+    if (lowLevelTypeChecks) {
+      ll_cas.ll_setRefArrayValue(arrayRef, i, v, true);
+    }
+    jcas.checkArrayBounds(arrayRef, i);
+    ll_cas.ll_setRefArrayValue(arrayRef, i, v);
+  }
+
+  /** initialize variables to correspond with Cas Type and Features
+   * @generated */
+  public SourceDocumentAnnotation_Type(JCas jcas, Type casType) {
+    super(jcas, casType);
+    casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+    casFeat_uri = jcas.getRequiredFeatureDE(casType, "uri", "uima.cas.String", featOkTst);
+    casFeatCode_uri = sdaCodeOf(casFeat_uri);
+
+    casFeat_contentType = jcas.getRequiredFeatureDE(casType, "contentType", "uima.cas.String", featOkTst);
+    casFeatCode_contentType = sdaCodeOf(casFeat_contentType);
+
+    casFeat_features = jcas.getRequiredFeatureDE(casType, "features", "uima.cas.FSArray", featOkTst);
+    casFeatCode_features = sdaCodeOf(casFeat_features);
+  }
+}
+
+
+
+    
\ No newline at end of file

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/SourceDocumentAnnotation_Type.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java?rev=732555&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java (added)
+++ incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java Wed Jan  7 15:26:46 2009
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.tika;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.uima.FeatureValue;
+import org.apache.uima.SourceDocumentAnnotation;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+
+/**
+ * Thin wrapper around Apache Tika: parses the content behind a URL and stores the
+ * extracted text, markup and metadata in a UIMA CAS.
+ */
+public class TIKAWrapper {
+	
+	// configuration for TIKA - can be created by specifying a custom resource
+	private TikaConfig config = null;
+	
+	/**
+	 * Creates a wrapper that uses Tika's default configuration.
+	 *
+	 * @throws TikaException if the default configuration cannot be obtained
+	 */
+	public TIKAWrapper() throws TikaException {
+		config = TikaConfig.getDefaultConfig();
+	}
+	
+	/**
+	 * Creates a wrapper from a custom Tika configuration. When the location is
+	 * null or the configuration cannot be loaded, the default configuration is
+	 * used instead.
+	 *
+	 * @param configLocation location of the custom configuration, may be null
+	 * @throws TikaException if the default configuration cannot be obtained
+	 */
+	public TIKAWrapper(String configLocation) throws TikaException {
+		if (configLocation != null) {
+			try {
+				config = new TikaConfig(configLocation);
+			} catch (Exception ignored) {
+				// best effort: an unusable custom configuration is not fatal,
+				// the default configuration is used below
+			}
+		}
+		if (config == null) {
+			config = TikaConfig.getDefaultConfig();
+		}
+	}
+	
+	/**
+	 * Same as {@link #populateCASfromURL(CAS, URL, String, String)} without a
+	 * mime type, so the parser is auto-detected.
+	 */
+	public void populateCASfromURL(CAS cas, URL url, String language) throws CASException {
+		populateCASfromURL(cas, url, null, language);
+	}
+	
+	/**
+	 * Parses the content at the given URL and fills the CAS with the text and
+	 * markup produced by the handler, plus a {@link SourceDocumentAnnotation}
+	 * holding one {@link FeatureValue} per Tika metadata entry and a final
+	 * "uri" entry. If parsing fails, the document text is set to the empty
+	 * string and nothing else is added.
+	 *
+	 * @param cas      CAS to populate
+	 * @param url      location of the document to parse
+	 * @param mime     mime type used to select a parser; null or empty means auto-detect
+	 * @param language document language to set on the CAS, ignored when null
+	 * @throws CASException if the URL cannot be opened or the JCas cannot be obtained
+	 */
+	public void populateCASfromURL(CAS cas, URL url, String mime, String language) throws CASException {
+		InputStream originalStream;
+		try {
+			originalStream = new BufferedInputStream(url.openStream());
+		} catch (IOException e) {
+			// the exception used to be created but never thrown, which left the
+			// stream null and ended in a NullPointerException when closing it
+			throw new CASException(e);
+		}
+		
+		Parser parser = selectParser(mime);
+		Metadata md = new Metadata();
+		MarkupHandler handler = new MarkupHandler();
+		
+		try {
+			parser.parse(originalStream, handler, md);
+		} catch (Exception e) {
+			// if we have a problem converting the document, leave an empty text and continue
+			cas.setDocumentText("");
+			return;
+		} finally {
+			// set language if it was explicitly specified as a configuration
+			// parameter
+			if (language != null) {
+				cas.setDocumentLanguage(language);
+			}
+			try {
+				originalStream.close();
+			} catch (IOException ignored) {
+				// nothing useful can be done if closing fails
+			}
+		}
+		
+		// add text and markup to CAS
+		handler.populateCAS(cas);
+		
+		JCas jcas = cas.getJCas();
+		addMetadata(cas, jcas, md, url);
+	}
+	
+	/** Returns the parser configured for the mime type, or an auto-detecting parser if there is none. */
+	private Parser selectParser(String mime) {
+		Parser parser = null;
+		if (mime != null && !"".equals(mime)) {
+			parser = config.getParser(mime);
+		}
+		// if that does not work rely on autodetection
+		if (parser == null) {
+			parser = new AutoDetectParser(config);
+		}
+		return parser;
+	}
+	
+	/** Creates and indexes the document annotation carrying the Tika metadata and the source URI. */
+	private void addMetadata(CAS cas, JCas jcas, Metadata md, URL url) {
+		SourceDocumentAnnotation docAnnotation = new SourceDocumentAnnotation(jcas);
+		
+		// fetch the names once instead of on every iteration
+		String[] names = md.names();
+		
+		// one slot per metadata entry plus one for the uri
+		if (docAnnotation.getFeatures() == null) {
+			docAnnotation.setFeatures((FSArray) cas.createArrayFS(names.length + 1));
+		}
+		
+		// now iterate on the metadata found by Tika and add them to the info
+		for (int i = 0; i < names.length; i++) {
+			FeatureValue fv = new FeatureValue(jcas);
+			fv.setName(names[i]);
+			fv.setValue(md.get(names[i]));
+			docAnnotation.setFeatures(i, fv);
+		}
+		
+		FeatureValue uriValue = new FeatureValue(jcas);
+		uriValue.setName("uri");
+		uriValue.setValue(url.toString());
+		docAnnotation.setFeatures(names.length, uriValue);
+		
+		docAnnotation.addToIndexes();
+	}
+}
\ No newline at end of file

Propchange: incubator/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain