Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC

svn commit: r1723223 [23/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * CTAKESParser decorates a {@link Parser} and relies on
+ * {@link CTAKESContentHandler} to extract biomedical information from
+ * clinical text using Apache cTAKES.
+ * <p>It is normally called by supplying an instance to 
+ *  {@link AutoDetectParser}, such as:
+ * <code>AutoDetectParser parser = new AutoDetectParser(new CTAKESParser());</code>
+ * <p>It can also be used by giving a Tika Config file similar to:
+ * <code>
+ *  <properties>
+ *    <parsers>
+ *      <parser class="org.apache.tika.parser.ctakes.CTAKESParser">
+ *        <parser class="org.apache.tika.parser.DefaultParser"/>
+ *      </parser>
+ *    </parsers>
+ *  </properties>
+ * </code>
+ * <p>Because this is a Parser Decorator, and not a normal Parser in
+ *  its own right, it isn't normally selected via the Parser Service Loader.
+ */
+public class CTAKESParser extends ParserDecorator {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -2313482748027097961L;
+
+    /**
+     * Wraps the default Parser
+     */
+    public CTAKESParser() {
+        this(TikaConfig.getDefaultConfig());
+    }
+    /**
+     * Wraps the default Parser for this Config
+     */
+    public CTAKESParser(TikaConfig config) {
+        this(config.getParser());
+    }
+    /**
+     * Wraps the specified Parser
+     */
+    public CTAKESParser(Parser parser) {
+        super(parser);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        CTAKESConfig config = context.get(CTAKESConfig.class,
+                new CTAKESConfig());
+        CTAKESContentHandler ctakesHandler = new CTAKESContentHandler(handler,
+                metadata, config);
+        super.parse(stream, ctakesHandler, metadata, context);
+    }
+    
+    //@Override
+    public String getDecorationName() {
+        return "CTakes";
+    }            
+}
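
A minimal usage sketch for the decorator above (imports and checked exceptions omitted; the input file name is hypothetical, and cTAKES with its default analysis engine descriptor is assumed to be on the classpath):

    ParseContext context = new ParseContext();
    context.set(CTAKESConfig.class, new CTAKESConfig());

    Parser parser = new CTAKESParser();              // wraps the default parser
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    try (InputStream stream = Files.newInputStream(Paths.get("clinical-note.txt"))) {
        parser.parse(stream, handler, metadata, context);
    }
    // CTAKESContentHandler adds the extracted annotations to the metadata
    // and forwards the text to the wrapped handler.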

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.util.XmlCasSerializer;
+
+/**
+ * Enumeration of the UIMA CAS serializer types supported by cTAKES.
+ * 
+ * A CAS serializer writes a CAS in the given format.
+ */
+public enum CTAKESSerializer {
+    XCAS(XCASSerializer.class.getName()),
+    XMI(XmiCasSerializer.class.getName()),
+    XML(XmlCasSerializer.class.getName());
+
+    private final String className;
+
+    private CTAKESSerializer(String className) {
+        this.className = className;
+    }
+
+    public String getClassName() {
+        return className;
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URISyntaxException;
+
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.cas.impl.XmiSerializationSharedData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XmlCasSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * This class provides utility methods used by {@link CTAKESContentHandler} to
+ * extract biomedical information from plain text with Apache cTAKES.
+ * 
+ * <p>
+ * Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache
+ * UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a>
+ * toolkit.
+ * </p>
+ */
+public class CTAKESUtils {
+	// UMLS username property
+	private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
+
+	// UMLS password property
+	private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
+
+	/**
+	 * Returns a new UIMA Analysis Engine (AE) configured from the given XML
+	 * descriptor.
+	 * 
+	 * <p>
+	 * An Analysis Engine is a component responsible for analyzing unstructured
+	 * information, discovering and representing semantic content. Unstructured
+	 * information includes, but is not restricted to, text documents.
+	 * </p>
+	 * 
+	 * @param aeDescriptor
+	 *            pathname for XML file including an AnalysisEngineDescription
+	 *            that contains all of the information needed to instantiate and
+	 *            use an AnalysisEngine.
+	 * @param umlsUser
+	 *            UMLS username for NLM database
+	 * @param umlsPass
+	 *            UMLS password for NLM database
+	 * @return an Analysis Engine for analyzing unstructured information.
+	 * @throws IOException
+	 *             if any I/O error occurs.
+	 * @throws InvalidXMLException
+	 *             if the input XML is not valid or does not specify a valid
+	 *             ResourceSpecifier.
+	 * @throws ResourceInitializationException
+	 *             if a failure occurred during production of the resource.
+	 * @throws URISyntaxException
+	 *             if the URL of the resource is not formatted strictly
+	 *             according to RFC2396 and cannot be converted to a URI.
+	 */
+	public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+			String umlsUser, String umlsPass) throws IOException,
+			InvalidXMLException, ResourceInitializationException,
+			URISyntaxException {
+		// resolve the AE descriptor on the classpath
+		String aeDescriptorPath = CTAKESUtils.class.getResource(aeDescriptor)
+				.toURI().getPath();
+
+		// get Resource Specifier from XML
+		XMLInputSource aeInputSource = new XMLInputSource(aeDescriptorPath);
+		ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
+				.parseResourceSpecifier(aeInputSource);
+
+		// UMLS user ID and password
+		if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null)
+				&& (!umlsPass.isEmpty())) {
+			/*
+			 * It is highly recommended that you change UMLS credentials in the
+			 * XML configuration file instead of giving user and password using
+			 * CTAKESConfig.
+			 */
+			System.setProperty(CTAKES_UMLS_USER, umlsUser);
+			System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+		}
+
+		// create AE
+		AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+
+		return ae;
+	}
+
+	/**
+	 * Returns a new JCas appropriate for the given Analysis Engine. A JCas is
+	 * the Java Cover Classes based object-oriented CAS (Common Analysis
+	 * System) API.
+	 * 
+	 * <p>
+	 * Important: It is highly recommended that you reuse CAS objects rather
+	 * than creating new CAS objects prior to each analysis. This is because CAS
+	 * objects may be expensive to create and may consume a significant amount
+	 * of memory.
+	 * </p>
+	 * 
+	 * @param ae
+	 *            AnalysisEngine used to create an appropriate JCas object.
+	 * @return a JCas object appropriate for the given AnalysisEngine.
+	 * @throws ResourceInitializationException
+	 *             if a CAS could not be created because this AnalysisEngine's
+	 *             CAS metadata (type system, type priorities, or FS indexes)
+	 *             are invalid.
+	 */
+	public static JCas getJCas(AnalysisEngine ae)
+			throws ResourceInitializationException {
+		JCas jcas = ae.newJCas();
+		
+		return jcas;
+	}
+
+	/**
+	 * Serializes a CAS in the given format.
+	 * 
+	 * @param jcas
+	 *            CAS (Common Analysis System) to be serialized.
+	 * @param type
+	 *            type of cTAKES (UIMA) serializer used to write CAS.
+	 * @param prettyPrint
+	 *            {@code true} to do pretty printing of output.
+	 * @param stream
+	 *            {@link OutputStream} object used to print out information
+	 *            extracted by using cTAKES.
+	 * @throws SAXException
+	 *             if there was a SAX exception.
+	 * @throws IOException
+	 *             if any I/O error occurs.
+	 */
+	public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint,
+			OutputStream stream) throws SAXException, IOException {
+		if (type == CTAKESSerializer.XCAS) {
+			XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+		} else if (type == CTAKESSerializer.XMI) {
+			XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+					stream, prettyPrint, new XmiSerializationSharedData());
+		} else {
+			XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+					stream);
+		}
+	}
+
+	/**
+	 * Returns the annotation value based on the given annotation type.
+	 * 
+	 * @param annotation
+	 *            {@link IdentifiedAnnotation} object.
+	 * @param property
+	 *            {@link CTAKESAnnotationProperty} enum used to identify the
+	 *            annotation type.
+	 * @return the annotation value.
+	 */
+	public static String getAnnotationProperty(IdentifiedAnnotation annotation,
+			CTAKESAnnotationProperty property) {
+		String value = null;
+		if (property == CTAKESAnnotationProperty.BEGIN) {
+			value = Integer.toString(annotation.getBegin());
+		} else if (property == CTAKESAnnotationProperty.END) {
+			value = Integer.toString(annotation.getEnd());
+		} else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+			value = Boolean.toString(annotation.getConditional());
+		} else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+			value = Float.toString(annotation.getConfidence());
+		} else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+			value = Integer.toString(annotation.getDiscoveryTechnique());
+		} else if (property == CTAKESAnnotationProperty.GENERIC) {
+			value = Boolean.toString(annotation.getGeneric());
+		} else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+			value = Integer.toString(annotation.getHistoryOf());
+		} else if (property == CTAKESAnnotationProperty.ID) {
+			value = Integer.toString(annotation.getId());
+		} else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+			FSArray mentions = annotation.getOntologyConceptArr();
+			StringBuilder sb = new StringBuilder();
+			if (mentions != null) {
+				for (int i = 0; i < mentions.size(); i++) {
+					if (mentions.get(i) instanceof UmlsConcept) {
+						UmlsConcept concept = (UmlsConcept) mentions.get(i);
+						sb.append(concept.getCui());
+						if (i < mentions.size() - 1) {
+							sb.append(",");
+						}
+					}
+				}
+			}
+			value = sb.toString();
+		} else if (property == CTAKESAnnotationProperty.POLARITY) {
+			value = Integer.toString(annotation.getPolarity());
+		}
+		return value;
+	}
+
+	/**
+	 * Resets the given cTAKES objects, if created, destroying the Analysis
+	 * Engine and emptying the JCas so that fresh instances can be obtained
+	 * from this class.
+	 * 
+	 * @param ae UIMA Analysis Engine
+	 * @param jcas JCas object
+	 */
+	public static void reset(AnalysisEngine ae, JCas jcas) {
+		// Analysis Engine
+		resetAE(ae);
+
+		// JCas
+		resetCAS(jcas);
+	}
+
+	/**
+	 * Resets the CAS (Common Analysis System), emptying it of all content.
+	 * 
+	 * @param jcas JCas object
+	 */
+	public static void resetCAS(JCas jcas) {
+		if (jcas != null) {
+			jcas.reset();
+		}
+	}
+
+	/**
+	 * Resets the AE (AnalysisEngine), releasing all resources held by the
+	 * current AE.
+	 * 
+	 * @param ae UIMA Analysis Engine
+	 */
+	public static void resetAE(AnalysisEngine ae) {
+		if (ae != null) {
+			ae.destroy();
+		}
+	}
+}
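
A rough end-to-end sketch of the helpers above (exceptions omitted; the descriptor path is a placeholder for whatever cTAKES pipeline descriptor is on the classpath, and no UMLS credentials are passed):

    AnalysisEngine ae = CTAKESUtils.getAnalysisEngine(
            "/ctakes-clinical-pipeline/desc/analysis_engine/AggregatePlaintextProcessor.xml",
            null, null);
    JCas jcas = CTAKESUtils.getJCas(ae);

    jcas.setDocumentText("Patient denies chest pain.");
    ae.process(jcas);                                  // run the cTAKES pipeline

    CTAKESUtils.serialize(jcas, CTAKESSerializer.XMI, true, System.out);
    CTAKESUtils.reset(ae, jcas);                       // destroy the AE, empty the CAS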

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import java.util.Stack;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class DIFContentHandler extends DefaultHandler {
+
+	private static final char[] NEWLINE = new char[] { '\n' };
+	private static final char[] TABSPACE = new char[] { '\t' };
+	private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+	private Stack<String> treeStack;
+	private Stack<String> dataStack;
+	private final ContentHandler delegate;
+	private boolean isLeaf;
+	private Metadata metadata;
+
+	public DIFContentHandler(ContentHandler delegate, Metadata metadata) {
+		this.delegate = delegate;
+		this.isLeaf = false;
+		this.metadata = metadata;
+		this.treeStack = new Stack<String>();
+		this.dataStack = new Stack<String>();
+	}
+
+	@Override
+	public void setDocumentLocator(org.xml.sax.Locator locator) {
+		delegate.setDocumentLocator(locator);
+	}
+
+	@Override
+	public void characters(char[] ch, int start, int length)
+			throws SAXException {
+		String value = new String(ch, start, length);
+		this.dataStack.push(value);
+
+		if (this.treeStack.peek().equals("Entry_Title")) {
+			this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+			this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+			this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES);
+			String title = "Title: ";
+			title = title + value;
+			this.delegate.characters(title.toCharArray(), 0, title.length());
+			this.delegate.endElement("", "h3", "h3");
+		}
+		if (this.treeStack.peek().equals("Southernmost_Latitude")
+				|| this.treeStack.peek().equals("Northernmost_Latitude")
+				|| this.treeStack.peek().equals("Westernmost_Longitude")
+				|| this.treeStack.peek().equals("Easternmost_Longitude")) {
+			this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+			this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+			this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+			this.delegate.startElement("", "tr", "tr", EMPTY_ATTRIBUTES);
+			this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES);
+			String key = this.treeStack.peek() + " : ";
+			this.delegate.characters(key.toCharArray(), 0, key.length());
+			this.delegate.endElement("", "td", "td");
+			this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES);
+			this.delegate.characters(value.toCharArray(), 0, value.length());
+			this.delegate.endElement("", "td", "td");
+			this.delegate.endElement("", "tr", "tr");
+		}
+	}
+
+	@Override
+	public void ignorableWhitespace(char[] ch, int start, int length)
+			throws SAXException {
+		delegate.ignorableWhitespace(ch, start, length);
+	}
+
+	@Override
+	public void startElement(String uri, String localName, String qName,
+			Attributes attributes) throws SAXException {
+		this.isLeaf = true;
+		if (localName.equals("Spatial_Coverage")) {
+			this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+			this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+			this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES);
+			String value = "Geographic Data: ";
+			this.delegate.characters(value.toCharArray(), 0, value.length());
+			this.delegate.endElement("", "h3", "h3");
+			this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+			this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+			this.delegate.startElement("", "table", "table", EMPTY_ATTRIBUTES);
+		}
+		this.treeStack.push(localName);
+	}
+
+	@Override
+	public void endElement(String uri, String localName, String qName)
+			throws SAXException {
+		if (localName.equals("Spatial_Coverage")) {
+			this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+			this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+			this.delegate.endElement("", "table", "table");
+		}
+		if (this.isLeaf) {
+			Stack<String> tempStack = (Stack<String>) this.treeStack.clone();
+			String key = "";
+			while (!tempStack.isEmpty()) {
+				if (key.length() == 0) {
+					key = tempStack.pop();
+				} else {
+					key = tempStack.pop() + "-" + key;
+				}
+			}
+			String value = this.dataStack.peek();
+			this.metadata.add(key, value);
+			this.isLeaf = false;
+		}
+		this.treeStack.pop();
+		this.dataStack.pop();
+	}
+
+	@Override
+	public void startDocument() throws SAXException {
+		delegate.startDocument();
+	}
+
+	@Override
+	public void endDocument() throws SAXException {
+		delegate.endDocument();
+	}
+
+	@Override
+	public String toString() {
+		return delegate.toString();
+	}
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class DIFParser extends AbstractParser {
+
+	/** Serial version UID */
+	private static final long serialVersionUID = 971505521275777826L;
+	private static final Set<MediaType> SUPPORTED_TYPES = Collections
+			.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.application("dif+xml"))));
+
+	@Override
+	public Set<MediaType> getSupportedTypes(ParseContext context) {
+		return SUPPORTED_TYPES;
+	}
+
+	@Override
+	public void parse(InputStream stream, ContentHandler handler,
+			Metadata metadata, ParseContext context) throws IOException,
+			SAXException, TikaException {
+		final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
+				metadata);
+		xhtml.startDocument();
+		xhtml.startElement("p");
+		TaggedContentHandler tagged = new TaggedContentHandler(handler);
+		try {
+			context.getSAXParser().parse(
+					new CloseShieldInputStream(stream),
+					new OfflineContentHandler(new EmbeddedContentHandler(
+							getContentHandler(tagged, metadata, context))));
+		} catch (SAXException e) {
+			tagged.throwIfCauseOf(e);
+			throw new TikaException("XML parse error", e);
+		} finally {
+			xhtml.endElement("p");
+			xhtml.endDocument();
+		}
+
+	}
+
+	protected ContentHandler getContentHandler(ContentHandler handler,
+			Metadata metadata, ParseContext context) {
+		
+		return new DIFContentHandler(handler, metadata);
+
+	}
+
+}
\ No newline at end of file
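
As with any Tika parser, the class above can be driven directly; a minimal sketch (exceptions omitted, the DIF file name is hypothetical):

    DIFParser parser = new DIFParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    try (InputStream stream = Files.newInputStream(Paths.get("dataset.dif"))) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    // DIFContentHandler joins the element path into metadata keys such as
    // "DIF-Entry_Title", and writes the title and spatial-coverage table
    // to the wrapped handler.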

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ */
+package org.apache.tika.parser.envi;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.nio.charset.Charset;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class EnviHeaderParser extends AbstractParser {
+
+    private static final long serialVersionUID = -1479368523072408091L;
+
+    public static final String ENVI_MIME_TYPE = "application/envi.hdr";
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("envi.hdr"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        // Only outputting the MIME type as metadata
+        metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);
+
+        // The following code was taken from the TXTParser
+        // Automatically detect the character encoding
+
+        try (AutoDetectReader reader = new AutoDetectReader(
+                new CloseShieldInputStream(stream), metadata)) {
+            Charset charset = reader.getCharset();
+            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
+                    metadata);
+
+            xhtml.startDocument();
+
+            // text contents of the xhtml
+            String line;
+            while ((line = reader.readLine()) != null) {
+                xhtml.startElement("p");
+                xhtml.characters(line);
+                xhtml.endElement("p");
+            }
+
+            xhtml.endDocument();
+        }
+    }
+}
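
A short sketch of the parser above (exceptions omitted, the header file name is hypothetical); the detected charset ends up in the Content-Encoding metadata and the header lines in the handler:

    EnviHeaderParser parser = new EnviHeaderParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    try (InputStream stream = Files.newInputStream(Paths.get("scene.hdr"))) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    String type = metadata.get(Metadata.CONTENT_TYPE);         // application/envi.hdr
    String charset = metadata.get(Metadata.CONTENT_ENCODING);  // e.g. ISO-8859-1
    String headerText = handler.toString();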

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.gdal;
+
+//JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN;
+
+/**
+ * Wraps execution of the <a href="http://gdal.org/">Geospatial Data Abstraction
+ * Library (GDAL)</a> <code>gdalinfo</code> tool used to extract geospatial
+ * information out of hundreds of geo file formats.
+ * <p/>
+ * The parser requires the installation of GDAL and for <code>gdalinfo</code> to
+ * be located on the path.
+ * <p/>
+ * Basic information (Size, Coordinate System, Bounding Box, Driver, and
+ * resource info) are extracted as metadata, and the remaining metadata patterns
+ * are extracted and added.
+ * <p/>
+ * The output of the command is available from the provided
+ * {@link ContentHandler} in the
+ * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method.
+ */
+public class GDALParser extends AbstractParser {
+
+    private static final long serialVersionUID = -3869130527323941401L;
+
+    private String command;
+
+    public GDALParser() {
+        setCommand("gdalinfo ${INPUT}");
+    }
+
+    public void setCommand(String command) {
+        this.command = command;
+    }
+
+    public String getCommand() {
+        return this.command;
+    }
+
+    public String processCommand(InputStream stream) {
+        TikaInputStream tis = (TikaInputStream) stream;
+        String pCommand = this.command;
+        try {
+            if (this.command.contains(INPUT_FILE_TOKEN)) {
+                pCommand = this.command.replace(INPUT_FILE_TOKEN, tis.getFile()
+                        .getPath());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        return pCommand;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        Set<MediaType> types = new HashSet<MediaType>();
+        types.add(MediaType.application("x-netcdf"));
+        types.add(MediaType.application("vrt"));
+        types.add(MediaType.image("geotiff"));
+        types.add(MediaType.image("nitf"));
+        types.add(MediaType.application("x-rpf-toc"));
+        types.add(MediaType.application("x-ecrg-toc"));
+        types.add(MediaType.image("hfa"));
+        types.add(MediaType.image("sar-ceos"));
+        types.add(MediaType.image("ceos"));
+        types.add(MediaType.application("jaxa-pal-sar"));
+        types.add(MediaType.application("gff"));
+        types.add(MediaType.application("elas"));
+        types.add(MediaType.application("aig"));
+        types.add(MediaType.application("aaigrid"));
+        types.add(MediaType.application("grass-ascii-grid"));
+        types.add(MediaType.application("sdts-raster"));
+        types.add(MediaType.application("dted"));
+        types.add(MediaType.image("png"));
+        types.add(MediaType.image("jpeg"));
+        types.add(MediaType.image("raster"));
+        types.add(MediaType.application("jdem"));
+        types.add(MediaType.image("gif"));
+        types.add(MediaType.image("big-gif"));
+        types.add(MediaType.image("envisat"));
+        types.add(MediaType.image("fits"));
+        types.add(MediaType.application("fits"));
+        types.add(MediaType.image("bsb"));
+        types.add(MediaType.application("xpm"));
+        types.add(MediaType.image("bmp"));
+        types.add(MediaType.image("x-dimap"));
+        types.add(MediaType.image("x-airsar"));
+        types.add(MediaType.application("x-rs2"));
+        types.add(MediaType.application("x-pcidsk"));
+        types.add(MediaType.application("pcisdk"));
+        types.add(MediaType.image("x-pcraster"));
+        types.add(MediaType.image("ilwis"));
+        types.add(MediaType.image("sgi"));
+        types.add(MediaType.application("x-srtmhgt"));
+        types.add(MediaType.application("leveller"));
+        types.add(MediaType.application("terragen"));
+        types.add(MediaType.application("x-gmt"));
+        types.add(MediaType.application("x-isis3"));
+        types.add(MediaType.application("x-isis2"));
+        types.add(MediaType.application("x-pds"));
+        types.add(MediaType.application("x-til"));
+        types.add(MediaType.application("x-ers"));
+        types.add(MediaType.application("x-l1b"));
+        types.add(MediaType.image("fit"));
+        types.add(MediaType.application("x-grib"));
+        types.add(MediaType.image("jp2"));
+        types.add(MediaType.application("x-rmf"));
+        types.add(MediaType.application("x-wcs"));
+        types.add(MediaType.application("x-wms"));
+        types.add(MediaType.application("x-msgn"));
+        types.add(MediaType.application("x-rst"));
+        types.add(MediaType.application("x-ingr"));
+        types.add(MediaType.application("x-gsag"));
+        types.add(MediaType.application("x-gsbg"));
+        types.add(MediaType.application("x-gs7bg"));
+        types.add(MediaType.application("x-cosar"));
+        types.add(MediaType.application("x-tsx"));
+        types.add(MediaType.application("x-coasp"));
+        types.add(MediaType.application("x-r"));
+        types.add(MediaType.application("x-map"));
+        types.add(MediaType.application("x-pnm"));
+        types.add(MediaType.application("x-doq1"));
+        types.add(MediaType.application("x-doq2"));
+        types.add(MediaType.application("x-envi"));
+        types.add(MediaType.application("x-envi-hdr"));
+        types.add(MediaType.application("x-generic-bin"));
+        types.add(MediaType.application("x-p-aux"));
+        types.add(MediaType.image("x-mff"));
+        types.add(MediaType.image("x-mff2"));
+        types.add(MediaType.image("x-fujibas"));
+        types.add(MediaType.application("x-gsc"));
+        types.add(MediaType.application("x-fast"));
+        types.add(MediaType.application("x-bt"));
+        types.add(MediaType.application("x-lan"));
+        types.add(MediaType.application("x-cpg"));
+        types.add(MediaType.image("ida"));
+        types.add(MediaType.application("x-ndf"));
+        types.add(MediaType.image("eir"));
+        types.add(MediaType.application("x-dipex"));
+        types.add(MediaType.application("x-lcp"));
+        types.add(MediaType.application("x-gtx"));
+        types.add(MediaType.application("x-los-las"));
+        types.add(MediaType.application("x-ntv2"));
+        types.add(MediaType.application("x-ctable2"));
+        types.add(MediaType.application("x-ace2"));
+        types.add(MediaType.application("x-snodas"));
+        types.add(MediaType.application("x-kro"));
+        types.add(MediaType.image("arg"));
+        types.add(MediaType.application("x-rik"));
+        types.add(MediaType.application("x-usgs-dem"));
+        types.add(MediaType.application("x-gxf"));
+        types.add(MediaType.application("x-dods"));
+        types.add(MediaType.application("x-http"));
+        types.add(MediaType.application("x-bag"));
+        types.add(MediaType.application("x-hdf"));
+        types.add(MediaType.image("x-hdf5-image"));
+        types.add(MediaType.application("x-nwt-grd"));
+        types.add(MediaType.application("x-nwt-grc"));
+        types.add(MediaType.image("adrg"));
+        types.add(MediaType.image("x-srp"));
+        types.add(MediaType.application("x-blx"));
+        types.add(MediaType.application("x-rasterlite"));
+        types.add(MediaType.application("x-epsilon"));
+        types.add(MediaType.application("x-sdat"));
+        types.add(MediaType.application("x-kml"));
+        types.add(MediaType.application("x-xyz"));
+        types.add(MediaType.application("x-geo-pdf"));
+        types.add(MediaType.image("x-ozi"));
+        types.add(MediaType.application("x-ctg"));
+        types.add(MediaType.application("x-e00-grid"));
+        types.add(MediaType.application("x-zmap"));
+        types.add(MediaType.application("x-webp"));
+        types.add(MediaType.application("x-ngs-geoid"));
+        types.add(MediaType.application("x-mbtiles"));
+        types.add(MediaType.application("x-ppi"));
+        types.add(MediaType.application("x-cappi"));
+        return types;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        if (!ExternalParser.check("gdalinfo")) {
+            return;
+        }
+
+        // first set up and run GDAL
+        // process the command
+        TemporaryResources tmp = new TemporaryResources();
+        TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
+        String runCommand = processCommand(tis);
+        String output = execCommand(new String[]{runCommand});
+
+        // now extract the actual metadata params
+        // from the GDAL output in the content stream
+        // to do this, we need to literally process the output
+        // from the invoked command b/c we can't read metadata and
+        // output text from the handler in ExternalParser
+        // at the same time, so for now, we can't use the
+        // ExternalParser to do this and I've had to bring some of
+        // that functionality directly into this class
+        // TODO: investigate a way to do both using ExternalParser
+
+        extractMetFromOutput(output, metadata);
+        applyPatternsToOutput(output, metadata, getPatterns());
+
+        // make the content handler and provide output there
+        // now that we have metadata
+        processOutput(handler, metadata, output);
+    }
+
+    private Map<Pattern, String> getPatterns() {
+        Map<Pattern, String> patterns = new HashMap<Pattern, String>();
+        this.addPatternWithColon("Driver", patterns);
+        this.addPatternWithColon("Files", patterns);
+        this.addPatternWithIs("Size", patterns);
+        this.addPatternWithIs("Coordinate System", patterns);
+        this.addBoundingBoxPattern("Upper Left", patterns);
+        this.addBoundingBoxPattern("Lower Left", patterns);
+        this.addBoundingBoxPattern("Upper Right", patterns);
+        this.addBoundingBoxPattern("Lower Right", patterns);
+        return patterns;
+    }
+
+    private void addPatternWithColon(String name, Map<Pattern, String> patterns) {
+        patterns.put(
+                Pattern.compile(name + "\\:\\s*([A-Za-z0-9/ _\\-\\.]+)\\s*"),
+                name);
+    }
+
+    private void addPatternWithIs(String name, Map<Pattern, String> patterns) {
+        patterns.put(Pattern.compile(name + " is ([A-Za-z0-9\\.,\\s`']+)"),
+                name);
+    }
+
+    private void addBoundingBoxPattern(String name,
+                                       Map<Pattern, String> patterns) {
+        patterns.put(
+                Pattern.compile(name
+                        + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"),
+                name);
+    }
+
+    private void extractMetFromOutput(String output, Metadata met) {
+        Scanner scanner = new Scanner(output);
+        String currentKey = null;
+        String[] headings = {"Subdatasets", "Corner Coordinates"};
+        StringBuilder metVal = new StringBuilder();
+        while (scanner.hasNextLine()) {
+            String line = scanner.nextLine();
+            if (line.contains("=") || hasHeadings(line, headings)) {
+                if (currentKey != null) {
+                    // time to flush this key and met val
+                    met.add(currentKey, metVal.toString());
+                }
+                metVal.setLength(0);
+
+                String[] lineToks = line.split("=");
+                currentKey = lineToks[0].trim();
+                if (lineToks.length == 2) {
+                    metVal.append(lineToks[1]);
+                } else {
+                    metVal.append("");
+                }
+            } else {
+                metVal.append(line);
+            }
+
+        }
+        // flush the final key and value once all output lines have been read
+        if (currentKey != null) {
+            met.add(currentKey, metVal.toString());
+        }
+    }
+
+    private boolean hasHeadings(String line, String[] headings) {
+        if (headings != null && headings.length > 0) {
+            for (String heading : headings) {
+                if (line.contains(heading)) {
+                    return true;
+                }
+            }
+            return false;
+        } else return false;
+    }
+
+    private void applyPatternsToOutput(String output, Metadata metadata,
+                                       Map<Pattern, String> metadataPatterns) {
+        Scanner scanner = new Scanner(output);
+        while (scanner.hasNextLine()) {
+            String line = scanner.nextLine();
+            for (Pattern p : metadataPatterns.keySet()) {
+                Matcher m = p.matcher(line);
+                if (m.find()) {
+                    if (metadataPatterns.get(p) != null
+                            && !metadataPatterns.get(p).equals("")) {
+                        metadata.add(metadataPatterns.get(p), m.group(1));
+                    } else {
+                        metadata.add(m.group(1), m.group(2));
+                    }
+                }
+            }
+        }
+
+    }
+
+    private String execCommand(String[] cmd) throws IOException {
+        // Execute
+        Process process;
+        String output = null;
+        if (cmd.length == 1) {
+            process = Runtime.getRuntime().exec(cmd[0]);
+        } else {
+            process = Runtime.getRuntime().exec(cmd);
+        }
+
+        try {
+            InputStream out = process.getInputStream();
+
+            try {
+                output = extractOutput(out);
+            } catch (Exception e) {
+                e.printStackTrace();
+                output = "";
+            }
+
+        } finally {
+            try {
+                process.waitFor();
+            } catch (InterruptedException ignore) {
+            }
+        }
+        return output;
+
+    }
+
+    private String extractOutput(InputStream stream) throws SAXException,
+            IOException {
+        StringBuilder sb = new StringBuilder();
+        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+            char[] buffer = new char[1024];
+            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                sb.append(buffer, 0, n);
+            }
+        }
+        return sb.toString();
+    }
+
+    private void processOutput(ContentHandler handler, Metadata metadata,
+                               String output) throws SAXException, IOException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        InputStream stream = new ByteArrayInputStream(output.getBytes(UTF_8));
+        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+            xhtml.startDocument();
+            xhtml.startElement("p");
+            char[] buffer = new char[1024];
+            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                xhtml.characters(buffer, 0, n);
+            }
+            xhtml.endElement("p");
+
+        } finally {
+            xhtml.endDocument();
+        }
+
+    }
+
+}
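
The parser above shells out to gdalinfo, so it produces output only when GDAL is installed and gdalinfo is on the PATH. A rough sketch (exceptions omitted, the file name is hypothetical); the metadata keys come from getPatterns() above:

    GDALParser parser = new GDALParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    try (InputStream stream = TikaInputStream.get(Paths.get("elevation.tif"))) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    String driver = metadata.get("Driver");                    // e.g. GTiff/GeoTIFF
    String coordSystem = metadata.get("Coordinate System");
    String upperLeft = metadata.get("Upper Left");             // bounding-box corner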

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteException;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.JSONValue;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParser extends AbstractParser {
+    private static final long serialVersionUID = -2241391757440215491L;
+    private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+    private static final MediaType MEDIA_TYPE = 
+                                    MediaType.application("geotopic");
+    private static final Set<MediaType> SUPPORTED_TYPES = 
+                                    Collections.singleton(MEDIA_TYPE);
+    
+    private GeoParserConfig config = new GeoParserConfig();
+
+    private boolean initialized;
+    private URL modelUrl;
+    private NameEntityExtractor extractor;
+    private boolean available;
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Initializes this parser
+     * @param modelUrl the URL to NER model
+     */
+    public void initialize(URL modelUrl) {
+        if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+            // Previously initialized for the same URL, no initialization needed
+            return;
+        }
+        
+        this.modelUrl = modelUrl;
+        
+        // Check if the NER model is available, and if the
+        //  lucene-geo-gazetteer is available
+        this.available = modelUrl != null && ExternalParser.check(
+                new String[] { "lucene-geo-gazetteer", "--help" }, -1);
+        if (this.available) {
+            try {
+                this.extractor = new NameEntityExtractor(modelUrl);
+            } catch (Exception e) {
+                LOG.warning("Named Entity Extractor setup failed: " + e);
+                this.available = false;
+            }
+        }
+        initialized = true;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        /*----------------configure this parser by ParseContext Object---------------------*/
+
+        this.config = context.get(GeoParserConfig.class, config);
+        initialize(this.config.getNerModelUrl());
+        if (!isAvailable()) {
+            return;
+        }
+
+        /*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
+        extractor.getAllNameEntitiesfromInput(stream);
+        extractor.getBestNameEntity();
+        ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+        String bestner = extractor.bestNameEntity;
+
+        /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
+        HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+
+        /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
+        GeoTag geotag = new GeoTag();
+        geotag.toGeoTag(resolvedGeonames, bestner);
+
+        /* add resolved entities in metadata */
+
+        metadata.add("Geographic_NAME", geotag.Geographic_NAME);
+        metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
+        metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+        for (int i = 0; i < geotag.alternatives.size(); ++i) {
+            GeoTag alter = (GeoTag) geotag.alternatives.get(i);
+            metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+            metadata.add("Optional_LONGITUDE" + (i + 1),
+                         alter.Geographic_LONGTITUDE);
+            metadata.add("Optional_LATITUDE" + (i + 1),
+                         alter.Geographic_LATITUDE);
+        }
+    }
+
+    public HashMap<String, ArrayList<String>> searchGeoNames(
+            ArrayList<String> locationNameEntities) throws ExecuteException,
+            IOException {
+        CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        cmdLine.addArgument("-s");
+        for (String name : locationNameEntities) {
+            cmdLine.addArgument(name);
+        }
+
+        LOG.fine("Executing: " + cmdLine);
+        DefaultExecutor exec = new DefaultExecutor();
+        exec.setExitValue(0);
+        ExecuteWatchdog watchdog = new ExecuteWatchdog(60000);
+        exec.setWatchdog(watchdog);
+        PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+        exec.setStreamHandler(streamHandler);
+        int exitValue = exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
+        String outputJson = outputStream.toString("UTF-8");
+        JSONArray json = (JSONArray) JSONValue.parse(outputJson);
+
+        HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
+        for (int i = 0; i < json.size(); i++) {
+            JSONObject obj = (JSONObject) json.get(i);
+            for (Object key : obj.keySet()) {
+                String theKey = (String) key;
+                JSONArray vals = (JSONArray) obj.get(theKey);
+                ArrayList<String> stringVals = new ArrayList<String>(
+                        vals.size());
+                for (int j = 0; j < vals.size(); j++) {
+                    String val = (String) vals.get(j);
+                    stringVals.add(val);
+                }
+
+                returnHash.put(theKey, stringVals);
+            }
+        }
+
+        return returnHash;
+    }
+
+    public boolean isAvailable() {
+        if (!initialized) {
+            initialize(config.getNerModelUrl());
+        }
+        return this.available;
+    }
+}
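[Note, not part of the commit: a minimal usage sketch of the parser above, showing the metadata keys that parse() writes. It assumes GeoParser has a public no-arg constructor and the standard org.apache.tika.parser.Parser contract, and that the lucene-geo-gazetteer command is on the PATH; "polar.txt" is a hypothetical input file.]

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.geo.topic.GeoParser;
    import org.xml.sax.helpers.DefaultHandler;

    public class GeoParserUsageSketch {
        public static void main(String[] args) throws Exception {
            GeoParser parser = new GeoParser();
            Metadata metadata = new Metadata();
            try (InputStream stream = Files.newInputStream(Paths.get("polar.txt"))) {
                parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
            }
            // Best resolved location (absent if the gazetteer was not available).
            System.out.println(metadata.get("Geographic_NAME"));
            System.out.println(metadata.get("Geographic_LONGITUDE"));
            System.out.println(metadata.get("Geographic_LATITUDE"));
            // Lower-ranked candidates are numbered Optional_NAME1, Optional_NAME2, ...
            System.out.println(metadata.get("Optional_NAME1"));
        }
    }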

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.File;
+import java.io.Serializable;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+public class GeoParserConfig implements Serializable {
+    private static final long serialVersionUID = -3167692634278575818L;
+    private URL nerModelUrl = null;
+
+    public GeoParserConfig() {
+        this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
+    }
+
+    public void setNERModelPath(String path) {
+        if (path == null)
+            return;
+        File file = new File(path);
+        if (file.isDirectory() || !file.exists()) {
+            return;
+        }
+        try {
+            this.nerModelUrl = file.toURI().toURL();
+        } catch (MalformedURLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public void setNerModelUrl(URL url) {
+        this.nerModelUrl = url;
+    }
+    public URL getNerModelUrl() {
+        return nerModelUrl;
+    }
+}
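[Note, not part of the commit: a short fragment showing how a caller might supply a custom OpenNLP model through this config. GeoParser.parse() picks it up via context.get(GeoParserConfig.class, config), so setting it on the ParseContext overrides the bundled en-ner-location.bin. The model path below is hypothetical.]

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.geo.topic.GeoParserConfig;

    GeoParserConfig config = new GeoParserConfig();
    config.setNERModelPath("/opt/models/en-ner-location.bin"); // hypothetical local path

    ParseContext context = new ParseContext();
    context.set(GeoParserConfig.class, config);
    // pass this context to GeoParser.parse(stream, handler, metadata, context)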

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class GeoTag {
+	String Geographic_NAME;
+	String Geographic_LONGTITUDE;
+	String Geographic_LATITUDE;
+	ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+
+	public void setMain(String name, String longitude, String latitude) {
+		Geographic_NAME = name;
+		Geographic_LONGTITUDE = longitude;
+		Geographic_LATITUDE = latitude;
+	}
+
+	public void addAlternative(GeoTag geotag) {
+		alternatives.add(geotag);
+	}
+
+	/*
+	 * Store resolved geoName entities in a GeoTag
+	 * 
+	 * @param resolvedGeonames resolved entities
+	 * 
+	 * @param bestNER best name entity among all the extracted entities for the
+	 * input stream
+	 */
+	public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
+			String bestNER) {
+
+		for (String key : resolvedGeonames.keySet()) {
+			ArrayList<String> cur = resolvedGeonames.get(key);
+			if (key.equals(bestNER)) {
+				this.Geographic_NAME = cur.get(0);
+				this.Geographic_LONGTITUDE = cur.get(1);
+				this.Geographic_LATITUDE = cur.get(2);
+			} else {
+				GeoTag alter = new GeoTag();
+				alter.Geographic_NAME = cur.get(0);
+				alter.Geographic_LONGTITUDE = cur.get(1);
+				alter.Geographic_LATITUDE = cur.get(2);
+				this.addAlternative(alter);
+			}
+		}
+	}
+}
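[Note, not part of the commit: a fragment illustrating the map shape toGeoTag() expects. Each extracted entity maps to [resolvedName, longitude, latitude], which is what GeoParser.searchGeoNames() builds from the gazetteer's JSON output; the coordinates below are illustrative only.]

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;

    HashMap<String, ArrayList<String>> resolved = new HashMap<String, ArrayList<String>>();
    // index 0 = resolved name, 1 = longitude, 2 = latitude (illustrative values)
    resolved.put("Pasadena", new ArrayList<String>(Arrays.asList("Pasadena", "-118.14", "34.15")));
    resolved.put("Texas", new ArrayList<String>(Arrays.asList("Texas", "-99.25", "31.25")));

    GeoTag tag = new GeoTag();
    tag.toGeoTag(resolved, "Pasadena");
    // tag.Geographic_NAME is now "Pasadena"; "Texas" is added to tag.alternatives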

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.commons.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class NameEntityExtractor {
+    ArrayList<String> locationNameEntities;
+    String bestNameEntity;
+    private HashMap<String, Integer> tf;
+    private final NameFinderME nameFinder;
+
+    public NameEntityExtractor(URL modelUrl) throws IOException {
+        this.locationNameEntities = new ArrayList<String>();
+        this.bestNameEntity = null;
+        TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+        this.nameFinder = new NameFinderME(model);
+        this.tf = new HashMap<String, Integer>();
+    }
+
+    /*
+     * Use OpenNLP to extract location names that appear in the stream.
+     * OpenNLP's default Name Finder accuracy is not very good; please refer
+     * to its documentation.
+     * 
+     * @param stream stream that passed from this.parse()
+     */
+    public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
+        String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+        Span[] nameE;
+        
+        //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+        synchronized (nameFinder) {
+            nameE = nameFinder.find(in);
+            //the same name finder is reused, so clear adaptive data
+            nameFinder.clearAdaptiveData();
+        }
+
+        String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
+        spanNames = spanNames.substring(1, spanNames.length() - 1);
+        String[] tmp = spanNames.split(",");
+
+        for (String name : tmp) {
+            name = name.trim();
+            this.locationNameEntities.add(name);
+        }
+
+
+    }
+
+    /*
+     * Get the best location entity extracted from the input stream: simply
+     * the most frequent entity. If several entities share the highest
+     * frequency, one is picked arbitrarily. This may not be the optimal
+     * solution, but it works.
+     *
+     * Operates on this.locationNameEntities, the OpenNLP name finder's results.
+     */
+    public void getBestNameEntity() {
+        if (this.locationNameEntities.size() == 0)
+            return;
+
+        for (int i = 0; i < this.locationNameEntities.size(); ++i) {
+            if (tf.containsKey(this.locationNameEntities.get(i)))
+                tf.put(this.locationNameEntities.get(i),
+                        tf.get(this.locationNameEntities.get(i)) + 1);
+            else
+                tf.put(this.locationNameEntities.get(i), 1);
+        }
+        int max = 0;
+        List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
+                tf.entrySet());
+        Collections.shuffle(list);
+        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+            public int compare(Map.Entry<String, Integer> o1,
+                    Map.Entry<String, Integer> o2) {
+                // Descending Order
+                return o2.getValue().compareTo(o1.getValue());
+            }
+        });
+
+        this.locationNameEntities.clear();// update so that they are in
+                                          // descending order
+        for (Map.Entry<String, Integer> entry : list) {
+            this.locationNameEntities.add(entry.getKey());
+            if (entry.getValue() > max) {
+                max = entry.getValue();
+                this.bestNameEntity = entry.getKey();
+            }
+        }
+    }
+}
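[Note, not part of the commit: a standalone fragment exercising the extractor directly. It assumes the code runs inside the org.apache.tika.parser.geo.topic package (locationNameEntities and bestNameEntity are package-private) and that en-ner-location.bin resolves on the classpath as in GeoParserConfig; the sample sentence is illustrative.]

    import java.io.ByteArrayInputStream;
    import java.net.URL;
    import static java.nio.charset.StandardCharsets.UTF_8;

    URL modelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
    NameEntityExtractor extractor = new NameEntityExtractor(modelUrl);

    String text = "NASA JPL is headquartered in Pasadena, California."; // sample input
    extractor.getAllNameEntitiesfromInput(new ByteArrayInputStream(text.getBytes(UTF_8)));
    extractor.getBestNameEntity();

    System.out.println(extractor.locationNameEntities); // extracted location names
    System.out.println(extractor.bestNameEntity);       // the most frequent one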