You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [22/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-m...
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Class used to extract biomedical information while parsing.
+ *
+ * <p>
+ * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>
+ * that is a natural language processing system for extraction of information
+ * from electronic medical record clinical free-text.
+ * </p>
+ */
+public class CTAKESContentHandler extends ContentHandlerDecorator {
+ // Prefix used for metadata including cTAKES annotations
+ public static String CTAKES_META_PREFIX = "ctakes:";
+
+ // Configuration object for CTAKESContentHandler
+ private CTAKESConfig config = null;
+
+ // StringBuilder object used to build the clinical free-text for cTAKES
+ private StringBuilder sb = null;
+
+ // Metadata object used for cTAKES annotations
+ private Metadata metadata = null;
+
+ // UIMA Analysis Engine
+ private AnalysisEngine ae = null;
+
+ // JCas object for working with the CAS (Common Analysis System)
+ private JCas jcas = null;
+
+ /**
+ * Creates a new {@see CTAKESContentHandler} for the given {@see
+ * ContentHandler} and Metadata objects.
+ *
+ * @param handler
+ * the {@see ContentHandler} object to be decorated.
+ * @param metadata
+ * the {@see Metadata} object that will be populated using
+ * biomedical information extracted by cTAKES.
+ * @param config
+ * the {@see CTAKESConfig} object used to configure the handler.
+ */
+ public CTAKESContentHandler(ContentHandler handler, Metadata metadata,
+ CTAKESConfig config) {
+ super(handler);
+ this.metadata = metadata;
+ this.config = config;
+ this.sb = new StringBuilder();
+ }
+
+ /**
+ * Creates a new {@see CTAKESContentHandler} for the given {@see
+ * ContentHandler} and Metadata objects.
+ *
+ * @param handler
+ * the {@see ContentHandler} object to be decorated.
+ * @param metadata
+ * the {@see Metadata} object that will be populated using
+ * biomedical information extracted by cTAKES.
+ */
+ public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
+ this(handler, metadata, new CTAKESConfig());
+ }
+
+ /**
+ * Default constructor.
+ */
+ public CTAKESContentHandler() {
+ this(new DefaultHandler(), new Metadata());
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (config.isText()) {
+ sb.append(ch, start, length);
+ }
+ super.characters(ch, start, length);
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ try {
+ // create an Analysis Engine
+ if (ae == null) {
+ ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+ }
+
+ // create a JCas, given an AE
+ if (jcas == null) {
+ jcas = CTAKESUtils.getJCas(ae);
+ }
+
+ // get metadata to process
+ StringBuilder metaText = new StringBuilder();
+ String[] metadataToProcess = config.getMetadata();
+ if (metadataToProcess != null) {
+ for (String name : config.getMetadata()) {
+ for (String value : metadata.getValues(name)) {
+ metaText.append(value);
+ metaText.append(System.lineSeparator());
+ }
+ }
+ }
+
+ // analyze text
+ jcas.setDocumentText(metaText.toString() + sb.toString());
+ ae.process(jcas);
+
+ // add annotations to metadata
+ metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString());
+ CTAKESAnnotationProperty[] annotationPros = config.getAnnotationProps();
+ Collection<IdentifiedAnnotation> collection = JCasUtil.select(jcas, IdentifiedAnnotation.class);
+ Iterator<IdentifiedAnnotation> iterator = collection.iterator();
+ while (iterator.hasNext()) {
+ IdentifiedAnnotation annotation = iterator.next();
+ StringBuilder annotationBuilder = new StringBuilder();
+ annotationBuilder.append(annotation.getCoveredText());
+ if (annotationPros != null) {
+ for (CTAKESAnnotationProperty property : annotationPros) {
+ annotationBuilder.append(config.getSeparatorChar());
+ annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property));
+ }
+ }
+ metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), annotationBuilder.toString());
+ }
+
+ if (config.isSerialize()) {
+ // serialize data
+ CTAKESUtils.serialize(jcas, config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream());
+ }
+ } catch (Exception e) {
+ throw new SAXException(e.getMessage());
+ } finally {
+ CTAKESUtils.resetCAS(jcas);
+ }
+ }
+
+ /**
+ * Returns metadata that includes cTAKES annotations.
+ *
+ * @return {@Metadata} object that includes cTAKES annotations.
+ */
+ public Metadata getMetadata() {
+ return metadata;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * CTAKESParser decorates a {@link Parser} and leverages
+ * {@link CTAKESContentHandler} to extract biomedical information from
+ * clinical text using Apache cTAKES.
+ * <p>It is normally called by supplying an instance to
+ * {@link AutoDetectParser}, such as:
+ * <code>AutoDetectParser parser = new AutoDetectParser(new CTAKESParser());</code>
+ * <p>It can also be used by giving a Tika Config file similar to:
+ * <code>
+ * <properties>
+ * <parsers>
+ * <parser class="org.apache.tika.parser.ctakes.CTAKESParser">
+ * <parser class="org.apache.tika.parser.DefaultParser"/>
+ * </parser>
+ * </parsers>
+ * </properties>
+ * </code>
+ * <p>Because this is a Parser Decorator, and not a normal Parser in
+ * its own right, it isn't normally selected via the Parser Service Loader.
+ */
+public class CTAKESParser extends ParserDecorator {
+    /** Serial version UID. */
+    private static final long serialVersionUID = -2313482748027097961L;
+
+    /**
+     * Wraps the parser of the default Tika configuration.
+     */
+    public CTAKESParser() {
+        this(TikaConfig.getDefaultConfig());
+    }
+
+    /**
+     * Wraps the parser returned by the given configuration.
+     *
+     * @param config the Tika configuration whose parser is decorated.
+     */
+    public CTAKESParser(TikaConfig config) {
+        this(config.getParser());
+    }
+
+    /**
+     * Wraps the specified parser.
+     *
+     * @param parser the parser to decorate.
+     */
+    public CTAKESParser(Parser parser) {
+        super(parser);
+    }
+
+    /**
+     * Parses the stream with the decorated parser, routing SAX events
+     * through a {@link CTAKESContentHandler} that wraps the caller's handler.
+     * The {@link CTAKESConfig} is taken from the parse context; a new one is
+     * constructed as the fallback value.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        CTAKESConfig ctakesConfig = context.get(CTAKESConfig.class, new CTAKESConfig());
+        ContentHandler decorated = new CTAKESContentHandler(handler, metadata, ctakesConfig);
+        super.parse(stream, decorated, metadata, context);
+    }
+
+    /**
+     * Returns the name of this decoration.
+     *
+     * @return the fixed string {@code "CTakes"}.
+     */
+    //@Override
+    public String getDecorationName() {
+        return "CTakes";
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.util.XmlCasSerializer;
+
+/**
+ * Enumeration for types of cTAKES (UIMA) CAS serializer supported by cTAKES.
+ *
+ * A CAS serializer writes a CAS in the given format.
+ */
+public enum CTAKESSerializer {
+    /** Serializer backed by {@link XCASSerializer}. */
+    XCAS(XCASSerializer.class.getName()),
+    /** Serializer backed by {@link XmiCasSerializer}. */
+    XMI(XmiCasSerializer.class.getName()),
+    /** Serializer backed by {@link XmlCasSerializer}. */
+    XML(XmlCasSerializer.class.getName());
+
+    // Fully qualified name of the UIMA serializer class for this constant
+    private final String className;
+
+    // enum constructors are implicitly private
+    CTAKESSerializer(String serializerClassName) {
+        className = serializerClassName;
+    }
+
+    /**
+     * Returns the class name of the serializer behind this constant.
+     *
+     * @return the fully qualified serializer class name.
+     */
+    public String getClassName() {
+        return this.className;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URISyntaxException;
+
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.cas.impl.XmiSerializationSharedData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XmlCasSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * This class provides methods to extract biomedical information from plain text
+ * using {@see CTAKESContentHandler} that relies on Apache cTAKES.
+ *
+ * <p>
+ * Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache
+ * UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a>
+ * toolkit.
+ * </p>
+ */
+public class CTAKESUtils {
+    // UMLS username system property
+    private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
+
+    // UMLS password system property
+    private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
+
+    /**
+     * Creates a new UIMA Analysis Engine (AE) from a descriptor found on the
+     * classpath. Every call produces a new engine; callers that want a single
+     * instance must keep the returned reference themselves.
+     *
+     * <p>
+     * An Analysis Engine is a component responsible for analyzing unstructured
+     * information, discovering and representing semantic content. Unstructured
+     * information includes, but is not restricted to, text documents.
+     * </p>
+     *
+     * <p>
+     * When both UMLS credentials are non-empty they are published as the JVM
+     * system properties {@code ctakes.umlsuser} and {@code ctakes.umlspw}
+     * before the engine is produced.
+     * </p>
+     *
+     * @param aeDescriptor
+     *            resource name (resolved through
+     *            {@code CTAKESUtils.class.getResource}) of the XML file
+     *            including an AnalysisEngineDescription that contains all of
+     *            the information needed to instantiate and use an
+     *            AnalysisEngine.
+     * @param umlsUser
+     *            UMLS username for NLM database
+     * @param umlsPass
+     *            UMLS password for NLM database
+     * @return an Analysis Engine for analyzing unstructured information.
+     * @throws IOException
+     *             if the descriptor resource cannot be found or any I/O error
+     *             occurs.
+     * @throws InvalidXMLException
+     *             if the input XML is not valid or does not specify a valid
+     *             ResourceSpecifier.
+     * @throws ResourceInitializationException
+     *             if a failure occurred during production of the resource.
+     * @throws URISyntaxException
+     *             if URL of the resource is not formatted strictly according
+     *             to RFC2396 and cannot be converted to a URI.
+     */
+    public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+            String umlsUser, String umlsPass) throws IOException,
+            InvalidXMLException, ResourceInitializationException,
+            URISyntaxException {
+        // locate the descriptor on the classpath; fail with a clear message
+        // instead of a NullPointerException when it is missing
+        java.net.URL descriptorUrl = (aeDescriptor == null)
+                ? null : CTAKESUtils.class.getResource(aeDescriptor);
+        if (descriptorUrl == null) {
+            throw new IOException("cTAKES analysis engine descriptor not found: " + aeDescriptor);
+        }
+        // NOTE(review): getPath() presumably only yields a usable file path for
+        // resources that live on the file system - confirm for packaged use.
+        String aeDescriptorPath = descriptorUrl.toURI().getPath();
+
+        // get Resource Specifier from XML
+        XMLInputSource aeInputSource = new XMLInputSource(aeDescriptorPath);
+        ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
+                .parseResourceSpecifier(aeInputSource);
+
+        // UMLS user ID and password
+        if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null)
+                && (!umlsPass.isEmpty())) {
+            /*
+             * It is highly recommended that you change UMLS credentials in the
+             * XML configuration file instead of giving user and password using
+             * CTAKESConfig.
+             */
+            System.setProperty(CTAKES_UMLS_USER, umlsUser);
+            System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+        }
+
+        // create AE
+        return UIMAFramework.produceAnalysisEngine(aeSpecifier);
+    }
+
+    /**
+     * Creates a new JCas appropriate for the given Analysis Engine. Every call
+     * asks the engine for a new JCas. A JCas is a Java Cover Classes based
+     * Object-oriented CAS (Common Analysis System) API.
+     *
+     * <p>
+     * Important: It is highly recommended that you reuse CAS objects rather
+     * than creating new CAS objects prior to each analysis. This is because CAS
+     * objects may be expensive to create and may consume a significant amount
+     * of memory.
+     * </p>
+     *
+     * @param ae
+     *            AnalysisEngine used to create an appropriate JCas object.
+     * @return a JCas object appropriate for the given AnalysisEngine.
+     * @throws ResourceInitializationException
+     *             if a CAS could not be created because this AnalysisEngine's
+     *             CAS metadata (type system, type priorities, or FS indexes)
+     *             are invalid.
+     */
+    public static JCas getJCas(AnalysisEngine ae)
+            throws ResourceInitializationException {
+        return ae.newJCas();
+    }
+
+    /**
+     * Serializes a CAS in the given format.
+     *
+     * @param jcas
+     *            CAS (Common Analysis System) to be serialized.
+     * @param type
+     *            type of cTAKES (UIMA) serializer used to write CAS; any value
+     *            other than XCAS or XMI is written with the XML serializer.
+     * @param prettyPrint
+     *            {@code true} to do pretty printing of output; only passed on
+     *            to the XCAS and XMI serializers.
+     * @param stream
+     *            {@link OutputStream} object used to print out information
+     *            extracted by using cTAKES.
+     * @throws SAXException
+     *             if there was a SAX exception.
+     * @throws IOException
+     *             if any I/O error occurs.
+     */
+    public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint,
+            OutputStream stream) throws SAXException, IOException {
+        if (type == CTAKESSerializer.XCAS) {
+            XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+        } else if (type == CTAKESSerializer.XMI) {
+            XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+                    stream, prettyPrint, new XmiSerializationSharedData());
+        } else {
+            XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+                    stream);
+        }
+    }
+
+    /**
+     * Returns the annotation value based on the given annotation type.
+     *
+     * @param annotation
+     *            {@link IdentifiedAnnotation} object.
+     * @param property
+     *            {@link CTAKESAnnotationProperty} enum used to identify the
+     *            annotation type.
+     * @return the annotation value as a string, or {@code null} when the
+     *         property is not one of those handled here.
+     */
+    public static String getAnnotationProperty(IdentifiedAnnotation annotation,
+            CTAKESAnnotationProperty property) {
+        if (property == CTAKESAnnotationProperty.BEGIN) {
+            return Integer.toString(annotation.getBegin());
+        }
+        if (property == CTAKESAnnotationProperty.END) {
+            return Integer.toString(annotation.getEnd());
+        }
+        if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+            return Boolean.toString(annotation.getConditional());
+        }
+        if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+            return Float.toString(annotation.getConfidence());
+        }
+        if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+            return Integer.toString(annotation.getDiscoveryTechnique());
+        }
+        if (property == CTAKESAnnotationProperty.GENERIC) {
+            return Boolean.toString(annotation.getGeneric());
+        }
+        if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+            return Integer.toString(annotation.getHistoryOf());
+        }
+        if (property == CTAKESAnnotationProperty.ID) {
+            return Integer.toString(annotation.getId());
+        }
+        if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+            return joinUmlsCuis(annotation.getOntologyConceptArr());
+        }
+        if (property == CTAKESAnnotationProperty.POLARITY) {
+            return Integer.toString(annotation.getPolarity());
+        }
+        return null;
+    }
+
+    /**
+     * Joins the CUIs of every {@link UmlsConcept} in the array with commas.
+     * Entries of other types are skipped and never leave a dangling comma.
+     */
+    private static String joinUmlsCuis(FSArray mentions) {
+        StringBuilder cuis = new StringBuilder();
+        if (mentions != null) {
+            boolean first = true;
+            for (int i = 0; i < mentions.size(); i++) {
+                if (mentions.get(i) instanceof UmlsConcept) {
+                    // separator goes before every CUI except the first one written
+                    if (!first) {
+                        cuis.append(",");
+                    }
+                    cuis.append(((UmlsConcept) mentions.get(i)).getCui());
+                    first = false;
+                }
+            }
+        }
+        return cuis.toString();
+    }
+
+    /**
+     * Resets cTAKES objects, if created: destroys the Analysis Engine and
+     * empties the CAS. The caller's own references are not cleared by this
+     * method; callers that want new objects must drop them themselves.
+     *
+     * @param ae UIMA Analysis Engine
+     * @param jcas JCas object
+     */
+    public static void reset(AnalysisEngine ae, JCas jcas) {
+        // Analysis Engine
+        resetAE(ae);
+
+        // JCas
+        resetCAS(jcas);
+    }
+
+    /**
+     * Resets the CAS (Common Analysis System), emptying it of all content.
+     * Does nothing when {@code jcas} is {@code null}.
+     *
+     * @param jcas JCas object
+     */
+    public static void resetCAS(JCas jcas) {
+        if (jcas != null) {
+            jcas.reset();
+        }
+    }
+
+    /**
+     * Resets the AE (AnalysisEngine), releasing all resources held by the
+     * current AE. Does nothing when {@code ae} is {@code null}.
+     *
+     * @param ae UIMA Analysis Engine
+     */
+    public static void resetAE(AnalysisEngine ae) {
+        if (ae != null) {
+            ae.destroy();
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import java.util.Stack;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class DIFContentHandler extends DefaultHandler {
+
+ private static final char[] NEWLINE = new char[] { '\n' };
+ private static final char[] TABSPACE = new char[] { '\t' };
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+ private Stack<String> treeStack;
+ private Stack<String> dataStack;
+ private final ContentHandler delegate;
+ private boolean isLeaf;
+ private Metadata metadata;
+
+ public DIFContentHandler(ContentHandler delegate, Metadata metadata) {
+ this.delegate = delegate;
+ this.isLeaf = false;
+ this.metadata = metadata;
+ this.treeStack = new Stack<String>();
+ this.dataStack = new Stack<String>();
+ }
+
+ @Override
+ public void setDocumentLocator(org.xml.sax.Locator locator) {
+ delegate.setDocumentLocator(locator);
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ String value = (new String(ch, start, length)).toString();
+ this.dataStack.push(value);
+
+ if (this.treeStack.peek().equals("Entry_Title")) {
+ this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+ this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+ this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES);
+ String title = "Title: ";
+ title = title + value;
+ this.delegate.characters(title.toCharArray(), 0, title.length());
+ this.delegate.endElement("", "h3", "h3");
+ }
+ if (this.treeStack.peek().equals("Southernmost_Latitude")
+ || this.treeStack.peek().equals("Northernmost_Latitude")
+ || this.treeStack.peek().equals("Westernmost_Longitude")
+ || this.treeStack.peek().equals("Easternmost_Longitude")) {
+ this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+ this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+ this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+ this.delegate.startElement("", "tr", "tr", EMPTY_ATTRIBUTES);
+ this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES);
+ String key = this.treeStack.peek() + " : ";
+ this.delegate.characters(key.toCharArray(), 0, key.length());
+ this.delegate.endElement("", "td", "td");
+ this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES);
+ this.delegate.characters(value.toCharArray(), 0, value.length());
+ this.delegate.endElement("", "td", "td");
+ this.delegate.endElement("", "tr", "tr");
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ delegate.ignorableWhitespace(ch, start, length);
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes attributes) throws SAXException {
+ this.isLeaf = true;
+ if (localName.equals("Spatial_Coverage")) {
+ this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+ this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+ this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES);
+ String value = "Geographic Data: ";
+ this.delegate.characters(value.toCharArray(), 0, value.length());
+ this.delegate.endElement("", "h3", "h3");
+ this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+ this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+ this.delegate.startElement("", "table", "table", EMPTY_ATTRIBUTES);
+ }
+ this.treeStack.push(localName);
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (localName.equals("Spatial_Coverage")) {
+ this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+ this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+ this.delegate.endElement("", "table", "table");
+ }
+ if (this.isLeaf) {
+ Stack<String> tempStack = (Stack<String>) this.treeStack.clone();
+ String key = "";
+ while (!tempStack.isEmpty()) {
+ if (key.length() == 0) {
+ key = tempStack.pop();
+ } else {
+ key = tempStack.pop() + "-" + key;
+ }
+ }
+ String value = this.dataStack.peek();
+ this.metadata.add(key, value);
+ this.isLeaf = false;
+ }
+ this.treeStack.pop();
+ this.dataStack.pop();
+ }
+
+ @Override
+ public void startDocument() throws SAXException {
+ delegate.startDocument();
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ delegate.endDocument();
+ }
+
+ @Override
+ public String toString() {
+ return delegate.toString();
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for {@code application/dif+xml} documents. The XML is read with the
+ * SAX parser supplied by the parse context and handled by the handler that
+ * {@link #getContentHandler} returns.
+ */
+public class DIFParser extends AbstractParser {
+
+    private static final long serialVersionUID = 971505521275777826L;
+
+    // The only media type this parser declares support for
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.application("dif+xml"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Opens an XHTML document with a single paragraph, parses the stream as
+     * XML, and always closes the paragraph and document afterwards.
+     *
+     * @throws TikaException
+     *             wrapping a {@link SAXException} raised during parsing that
+     *             the tagged handler does not rethrow itself.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+
+        TaggedContentHandler tagged = new TaggedContentHandler(handler);
+        try {
+            // same evaluation order as a single nested call: parser first,
+            // then the shielded stream, then the handler chain
+            javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
+            InputStream shielded = new CloseShieldInputStream(stream);
+            ContentHandler difHandler = getContentHandler(tagged, metadata, context);
+            saxParser.parse(shielded,
+                    new OfflineContentHandler(new EmbeddedContentHandler(difHandler)));
+        } catch (SAXException e) {
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Creates the handler that receives the SAX events of the DIF document.
+     *
+     * @return a new {@link DIFContentHandler} over the given handler and
+     *         metadata.
+     */
+    protected ContentHandler getContentHandler(ContentHandler handler,
+            Metadata metadata, ParseContext context) {
+        return new DIFContentHandler(handler, metadata);
+    }
+
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+package org.apache.tika.parser.envi;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.nio.charset.Charset;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for ENVI header files. It records the ENVI MIME type and the
+ * detected character encoding in the metadata and emits every line of the
+ * header as its own XHTML paragraph.
+ */
+public class EnviHeaderParser extends AbstractParser {
+
+    private static final long serialVersionUID = -1479368523072408091L;
+
+    /** Value stored under {@code Metadata.CONTENT_TYPE} by {@link #parse}. */
+    public static final String ENVI_MIME_TYPE = "application/envi.hdr";
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("envi.hdr"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context; not consulted
+     * @return a singleton set holding the ENVI header media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the header as text and writes it, one paragraph per line, to the
+     * given handler.
+     *
+     * @param stream   the header file; shielded so that closing the reader
+     *                 does not close the caller's stream
+     * @param handler  receiver of the XHTML output
+     * @param metadata receives the content type and content encoding
+     * @param context  the parse context; not consulted
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if the character encoding cannot be detected
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);
+
+        // The following code was taken from the TXTParser:
+        // automatically detect the character encoding.
+        try (AutoDetectReader reader = new AutoDetectReader(
+                new CloseShieldInputStream(stream), metadata)) {
+            Charset charset = reader.getCharset();
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
+                    metadata);
+
+            xhtml.startDocument();
+
+            // Each line of the header becomes one paragraph.
+            String line;
+            while ((line = reader.readLine()) != null) {
+                xhtml.startElement("p");
+                xhtml.characters(line);
+                xhtml.endElement("p");
+            }
+
+            xhtml.endDocument();
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.gdal;
+
+//JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN;
+
+//Tika imports
+//SAX imports
+
+/**
+ * Wraps execution of the <a href="http://gdal.org/">Geospatial Data Abstraction
+ * Library (GDAL)</a> <code>gdalinfo</code> tool used to extract geospatial
+ * information out of hundreds of geo file formats.
+ * <p/>
+ * The parser requires the installation of GDAL and for <code>gdalinfo</code> to
+ * be located on the path.
+ * <p/>
+ * Basic information (Size, Coordinate System, Bounding Box, Driver, and
+ * resource info) are extracted as metadata, and the remaining metadata patterns
+ * are extracted and added.
+ * <p/>
+ * The output of the command is available from the provided
+ * {@link ContentHandler} in the
+ * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method.
+ */
+public class GDALParser extends AbstractParser {
+
+ private static final long serialVersionUID = -3869130527323941401L;
+
+ // Command line template for the external tool; assigned via setCommand().
+ private String command;
+
+ /**
+ * Creates a parser that runs {@code gdalinfo} on the input.
+ * The {@code ${INPUT}} placeholder is presumably the same text as
+ * {@code ExternalParser.INPUT_FILE_TOKEN}, which processCommand replaces
+ * with the input file path -- TODO confirm.
+ */
+ public GDALParser() {
+ setCommand("gdalinfo ${INPUT}");
+ }
+
+ /**
+ * Sets the command line template used to invoke the external tool.
+ *
+ * @param command the command line template
+ */
+ public void setCommand(String command) {
+ this.command = command;
+ }
+
+ /**
+ * Returns the command line template, with any placeholder still unexpanded.
+ *
+ * @return the command line template
+ */
+ public String getCommand() {
+ return this.command;
+ }
+
+ public String processCommand(InputStream stream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ String pCommand = this.command;
+ try {
+ if (this.command.contains(INPUT_FILE_TOKEN)) {
+ pCommand = this.command.replace(INPUT_FILE_TOKEN, tis.getFile()
+ .getPath());
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ return pCommand;
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ Set<MediaType> types = new HashSet<MediaType>();
+ types.add(MediaType.application("x-netcdf"));
+ types.add(MediaType.application("vrt"));
+ types.add(MediaType.image("geotiff"));
+ types.add(MediaType.image("nitf"));
+ types.add(MediaType.application("x-rpf-toc"));
+ types.add(MediaType.application("x-ecrg-toc"));
+ types.add(MediaType.image("hfa"));
+ types.add(MediaType.image("sar-ceos"));
+ types.add(MediaType.image("ceos"));
+ types.add(MediaType.application("jaxa-pal-sar"));
+ types.add(MediaType.application("gff"));
+ types.add(MediaType.application("elas"));
+ types.add(MediaType.application("aig"));
+ types.add(MediaType.application("aaigrid"));
+ types.add(MediaType.application("grass-ascii-grid"));
+ types.add(MediaType.application("sdts-raster"));
+ types.add(MediaType.application("dted"));
+ types.add(MediaType.image("png"));
+ types.add(MediaType.image("jpeg"));
+ types.add(MediaType.image("raster"));
+ types.add(MediaType.application("jdem"));
+ types.add(MediaType.image("gif"));
+ types.add(MediaType.image("big-gif"));
+ types.add(MediaType.image("envisat"));
+ types.add(MediaType.image("fits"));
+ types.add(MediaType.application("fits"));
+ types.add(MediaType.image("bsb"));
+ types.add(MediaType.application("xpm"));
+ types.add(MediaType.image("bmp"));
+ types.add(MediaType.image("x-dimap"));
+ types.add(MediaType.image("x-airsar"));
+ types.add(MediaType.application("x-rs2"));
+ types.add(MediaType.application("x-pcidsk"));
+ types.add(MediaType.application("pcisdk"));
+ types.add(MediaType.image("x-pcraster"));
+ types.add(MediaType.image("ilwis"));
+ types.add(MediaType.image("sgi"));
+ types.add(MediaType.application("x-srtmhgt"));
+ types.add(MediaType.application("leveller"));
+ types.add(MediaType.application("terragen"));
+ types.add(MediaType.application("x-gmt"));
+ types.add(MediaType.application("x-isis3"));
+ types.add(MediaType.application("x-isis2"));
+ types.add(MediaType.application("x-pds"));
+ types.add(MediaType.application("x-til"));
+ types.add(MediaType.application("x-ers"));
+ types.add(MediaType.application("x-l1b"));
+ types.add(MediaType.image("fit"));
+ types.add(MediaType.application("x-grib"));
+ types.add(MediaType.image("jp2"));
+ types.add(MediaType.application("x-rmf"));
+ types.add(MediaType.application("x-wcs"));
+ types.add(MediaType.application("x-wms"));
+ types.add(MediaType.application("x-msgn"));
+ types.add(MediaType.application("x-wms"));
+ types.add(MediaType.application("x-wms"));
+ types.add(MediaType.application("x-rst"));
+ types.add(MediaType.application("x-ingr"));
+ types.add(MediaType.application("x-gsag"));
+ types.add(MediaType.application("x-gsbg"));
+ types.add(MediaType.application("x-gs7bg"));
+ types.add(MediaType.application("x-cosar"));
+ types.add(MediaType.application("x-tsx"));
+ types.add(MediaType.application("x-coasp"));
+ types.add(MediaType.application("x-r"));
+ types.add(MediaType.application("x-map"));
+ types.add(MediaType.application("x-pnm"));
+ types.add(MediaType.application("x-doq1"));
+ types.add(MediaType.application("x-doq2"));
+ types.add(MediaType.application("x-envi"));
+ types.add(MediaType.application("x-envi-hdr"));
+ types.add(MediaType.application("x-generic-bin"));
+ types.add(MediaType.application("x-p-aux"));
+ types.add(MediaType.image("x-mff"));
+ types.add(MediaType.image("x-mff2"));
+ types.add(MediaType.image("x-fujibas"));
+ types.add(MediaType.application("x-gsc"));
+ types.add(MediaType.application("x-fast"));
+ types.add(MediaType.application("x-bt"));
+ types.add(MediaType.application("x-lan"));
+ types.add(MediaType.application("x-cpg"));
+ types.add(MediaType.image("ida"));
+ types.add(MediaType.application("x-ndf"));
+ types.add(MediaType.image("eir"));
+ types.add(MediaType.application("x-dipex"));
+ types.add(MediaType.application("x-lcp"));
+ types.add(MediaType.application("x-gtx"));
+ types.add(MediaType.application("x-los-las"));
+ types.add(MediaType.application("x-ntv2"));
+ types.add(MediaType.application("x-ctable2"));
+ types.add(MediaType.application("x-ace2"));
+ types.add(MediaType.application("x-snodas"));
+ types.add(MediaType.application("x-kro"));
+ types.add(MediaType.image("arg"));
+ types.add(MediaType.application("x-rik"));
+ types.add(MediaType.application("x-usgs-dem"));
+ types.add(MediaType.application("x-gxf"));
+ types.add(MediaType.application("x-dods"));
+ types.add(MediaType.application("x-http"));
+ types.add(MediaType.application("x-bag"));
+ types.add(MediaType.application("x-hdf"));
+ types.add(MediaType.image("x-hdf5-image"));
+ types.add(MediaType.application("x-nwt-grd"));
+ types.add(MediaType.application("x-nwt-grc"));
+ types.add(MediaType.image("adrg"));
+ types.add(MediaType.image("x-srp"));
+ types.add(MediaType.application("x-blx"));
+ types.add(MediaType.application("x-rasterlite"));
+ types.add(MediaType.application("x-epsilon"));
+ types.add(MediaType.application("x-sdat"));
+ types.add(MediaType.application("x-kml"));
+ types.add(MediaType.application("x-xyz"));
+ types.add(MediaType.application("x-geo-pdf"));
+ types.add(MediaType.image("x-ozi"));
+ types.add(MediaType.application("x-ctg"));
+ types.add(MediaType.application("x-e00-grid"));
+ types.add(MediaType.application("x-zmap"));
+ types.add(MediaType.application("x-webp"));
+ types.add(MediaType.application("x-ngs-geoid"));
+ types.add(MediaType.application("x-mbtiles"));
+ types.add(MediaType.application("x-ppi"));
+ types.add(MediaType.application("x-cappi"));
+ return types;
+ }
+
+ /**
+  * Runs {@code gdalinfo} on the input, copies what it reports into the
+  * metadata and writes the raw tool output to the handler.
+  * <p>
+  * Nothing is done when {@code gdalinfo} cannot be executed.
+  *
+  * @param stream   the file to inspect; spooled to a temporary file when it
+  *                 is not already file-backed
+  * @param handler  receiver of the tool output as XHTML
+  * @param metadata receives the values extracted from the tool output
+  * @param context  the parse context; not consulted
+  * @throws IOException   if the tool cannot be started
+  * @throws SAXException  if the handler rejects an event
+  * @throws TikaException if the temporary resources cannot be released
+  */
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+         Metadata metadata, ParseContext context) throws IOException,
+         SAXException, TikaException {
+
+     if (!ExternalParser.check("gdalinfo")) {
+         return;
+     }
+
+     // first set up and run GDAL
+     // process the command
+     TemporaryResources tmp = new TemporaryResources();
+     try {
+         TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
+         String runCommand = processCommand(tis);
+         String output = execCommand(new String[]{runCommand});
+
+         // now extract the actual metadata params
+         // from the GDAL output in the content stream
+         // to do this, we need to literally process the output
+         // from the invoked command b/c we can't read metadata and
+         // output text from the handler in ExternalParser
+         // at the same time, so for now, we can't use the
+         // ExternalParser to do this and I've had to bring some of
+         // that functionality directly into this class
+         // TODO: investigate a way to do both using ExternalParser
+
+         extractMetFromOutput(output, metadata);
+         applyPatternsToOutput(output, metadata, getPatterns());
+
+         // make the content handler and provide output there
+         // now that we have metadata
+         processOutput(handler, metadata, output);
+     } finally {
+         // Release the temporary file created while spooling the stream;
+         // previously it was never cleaned up.
+         tmp.dispose();
+     }
+ }
+
+ private Map<Pattern, String> getPatterns() {
+ Map<Pattern, String> patterns = new HashMap<Pattern, String>();
+ this.addPatternWithColon("Driver", patterns);
+ this.addPatternWithColon("Files", patterns);
+ this.addPatternWithIs("Size", patterns);
+ this.addPatternWithIs("Coordinate System", patterns);
+ this.addBoundingBoxPattern("Upper Left", patterns);
+ this.addBoundingBoxPattern("Lower Left", patterns);
+ this.addBoundingBoxPattern("Upper Right", patterns);
+ this.addBoundingBoxPattern("Lower Right", patterns);
+ return patterns;
+ }
+
+ private void addPatternWithColon(String name, Map<Pattern, String> patterns) {
+ patterns.put(
+ Pattern.compile(name + "\\:\\s*([A-Za-z0-9/ _\\-\\.]+)\\s*"),
+ name);
+ }
+
+ private void addPatternWithIs(String name, Map<Pattern, String> patterns) {
+ patterns.put(Pattern.compile(name + " is ([A-Za-z0-9\\.,\\s`']+)"),
+ name);
+ }
+
+ /**
+  * Registers a pattern for a corner line of the form
+  * {@code name ( x, y )}, capturing the coordinate pair.
+  * <p>
+  * Each coordinate may carry a leading minus sign. The previous expression
+  * only accepted unsigned numbers, so corners with negative coordinates
+  * never matched and were left out of the metadata.
+  *
+  * @param name     the corner label to look for; also the metadata name
+  * @param patterns the table the new pattern is added to
+  */
+ private void addBoundingBoxPattern(String name,
+         Map<Pattern, String> patterns) {
+     patterns.put(
+             Pattern.compile(name
+                     + "\\s*\\(\\s*(-?[0-9]+\\.[0-9]+\\s*,\\s*-?[0-9]+\\.[0-9]+\\s*)\\)\\s*"),
+             name);
+ }
+
+ /**
+  * Splits the tool output into key/value pairs and adds them to the
+  * metadata.
+  * <p>
+  * A line that contains {@code =} or one of the known headings starts a new
+  * key; the lines that follow are appended to its value until the next key
+  * line. Two defects of the earlier version are fixed here:
+  * <ul>
+  * <li>the last pair was never written, because pairs were only flushed
+  * when the next key line was seen;</li>
+  * <li>the line was split on every {@code =}, which dropped values that
+  * themselves contain {@code =} and failed on a line consisting only of
+  * {@code =}.</li>
+  * </ul>
+  *
+  * @param output the complete text printed by the tool
+  * @param met    the metadata object the pairs are added to
+  */
+ private void extractMetFromOutput(String output, Metadata met) {
+     String currentKey = null;
+     String[] headings = {"Subdatasets", "Corner Coordinates"};
+     StringBuilder metVal = new StringBuilder();
+     try (Scanner scanner = new Scanner(output)) {
+         while (scanner.hasNextLine()) {
+             String line = scanner.nextLine();
+             if (line.contains("=") || hasHeadings(line, headings)) {
+                 if (currentKey != null) {
+                     // time to flush this key and met val
+                     met.add(currentKey, metVal.toString());
+                 }
+                 metVal.setLength(0);
+
+                 // Split on the first '=' only: the key is on the left and
+                 // everything else, including further '=', is the value.
+                 String[] lineToks = line.split("=", 2);
+                 currentKey = lineToks[0].trim();
+                 if (lineToks.length == 2) {
+                     metVal.append(lineToks[1]);
+                 }
+             } else {
+                 metVal.append(line);
+             }
+         }
+     }
+     // Flush the pair that was still being collected when the output ended.
+     if (currentKey != null) {
+         met.add(currentKey, metVal.toString());
+     }
+ }
+
+ /**
+  * Tells whether the line contains at least one of the given headings.
+  *
+  * @param line     the line to inspect
+  * @param headings the headings to look for; may be {@code null} or empty
+  * @return {@code true} if any heading occurs in the line
+  */
+ private boolean hasHeadings(String line, String[] headings) {
+     if (headings == null) {
+         return false;
+     }
+     for (String candidate : headings) {
+         if (line.contains(candidate)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ private void applyPatternsToOutput(String output, Metadata metadata,
+ Map<Pattern, String> metadataPatterns) {
+ Scanner scanner = new Scanner(output);
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+ for (Pattern p : metadataPatterns.keySet()) {
+ Matcher m = p.matcher(line);
+ if (m.find()) {
+ if (metadataPatterns.get(p) != null
+ && !metadataPatterns.get(p).equals("")) {
+ metadata.add(metadataPatterns.get(p), m.group(1));
+ } else {
+ metadata.add(m.group(1), m.group(2));
+ }
+ }
+ }
+ }
+
+ }
+
+ /**
+  * Runs the given command and returns everything it wrote to standard
+  * output.
+  * <p>
+  * A one-element array is passed to {@code Runtime.exec(String)}, which
+  * splits it on whitespace; longer arrays are passed as separate arguments.
+  * Reading the output is best effort: on failure the stack trace is printed
+  * and an empty string is returned. If the wait for the process is
+  * interrupted, the interrupt flag of the thread is restored so callers can
+  * still observe it (it used to be discarded).
+  *
+  * @param cmd the command, either as a single string or already tokenized
+  * @return the standard output of the process, or {@code ""} if it could
+  *         not be read
+  * @throws IOException if the process cannot be started
+  */
+ private String execCommand(String[] cmd) throws IOException {
+     Process process;
+     if (cmd.length == 1) {
+         process = Runtime.getRuntime().exec(cmd[0]);
+     } else {
+         process = Runtime.getRuntime().exec(cmd);
+     }
+
+     String output;
+     try {
+         try {
+             output = extractOutput(process.getInputStream());
+         } catch (Exception e) {
+             e.printStackTrace();
+             output = "";
+         }
+     } finally {
+         try {
+             process.waitFor();
+         } catch (InterruptedException e) {
+             // Do not swallow the interruption: re-assert it for the caller.
+             Thread.currentThread().interrupt();
+         }
+     }
+     return output;
+ }
+
+ private String extractOutput(InputStream stream) throws SAXException,
+ IOException {
+ StringBuilder sb = new StringBuilder();
+ try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+ char[] buffer = new char[1024];
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ sb.append(buffer, 0, n);
+ }
+ }
+ return sb.toString();
+ }
+
+ private void processOutput(ContentHandler handler, Metadata metadata,
+ String output) throws SAXException, IOException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ InputStream stream = new ByteArrayInputStream(output.getBytes(UTF_8));
+ try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+ xhtml.startDocument();
+ xhtml.startElement("p");
+ char[] buffer = new char[1024];
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ xhtml.characters(buffer, 0, n);
+ }
+ xhtml.endElement("p");
+
+ } finally {
+ xhtml.endDocument();
+ }
+
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteException;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.JSONValue;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParser extends AbstractParser {
+ private static final long serialVersionUID = -2241391757440215491L;
+ private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+ // The only media type this parser declares support for.
+ private static final MediaType MEDIA_TYPE =
+ MediaType.application("geotopic");
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MEDIA_TYPE);
+
+ // Default configuration; parse() replaces it with the one found in the
+ // ParseContext, if any.
+ private GeoParserConfig config = new GeoParserConfig();
+
+ // Set once initialize() has run to completion.
+ private boolean initialized;
+ // The model URL the parser was last initialized with.
+ private URL modelUrl;
+ // Created by initialize() when the parser is available.
+ private NameEntityExtractor extractor;
+ // True when a model URL is set, the gazetteer check passed and the
+ // extractor could be created.
+ private boolean available;
+
+ /**
+ * Returns the media types handled by this parser.
+ *
+ * @param parseContext the parse context; not consulted
+ * @return the constant {@code SUPPORTED_TYPES} set
+ */
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Initializes this parser
+ * @param modelUrl the URL to NER model
+ */
+ public void initialize(URL modelUrl) {
+ if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+ // Previously initialized for the same URL, no initialization needed
+ return;
+ }
+
+ this.modelUrl = modelUrl;
+
+ // Check if the NER model is available, and if the
+ // lucene-geo-gazetteer is available
+ this.available = modelUrl != null && ExternalParser.check(
+ new String[] { "lucene-geo-gazetteer", "--help" }, -1);
+ if (this.available) {
+ try {
+ this.extractor = new NameEntityExtractor(modelUrl);
+ } catch (Exception e) {
+ LOG.warning("Named Entity Extractor setup failed: " + e);
+ this.available = false;
+ }
+ }
+ initialized = true;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ /*----------------configure this parser by ParseContext Object---------------------*/
+
+ this.config = context.get(GeoParserConfig.class, config);
+ initialize(this.config.getNerModelUrl());
+ if (!isAvailable()) {
+ return;
+ }
+
+ /*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
+ extractor.getAllNameEntitiesfromInput(stream);
+ extractor.getBestNameEntity();
+ ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+ String bestner = extractor.bestNameEntity;
+
+ /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
+ HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+
+ /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
+ GeoTag geotag = new GeoTag();
+ geotag.toGeoTag(resolvedGeonames, bestner);
+
+ /* add resolved entities in metadata */
+
+ metadata.add("Geographic_NAME", geotag.Geographic_NAME);
+ metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
+ metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+ for (int i = 0; i < geotag.alternatives.size(); ++i) {
+ GeoTag alter = (GeoTag) geotag.alternatives.get(i);
+ metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+ metadata.add("Optional_LONGITUDE" + (i + 1),
+ alter.Geographic_LONGTITUDE);
+ metadata.add("Optional_LATITUDE" + (i + 1),
+ alter.Geographic_LATITUDE);
+ }
+ }
+
+ /**
+  * Looks the given names up with the external
+  * {@code lucene-geo-gazetteer} tool and returns what it resolved.
+  * <p>
+  * The tool output is parsed as JSON. When it is empty, not valid JSON or
+  * not an array, a warning is logged and an empty map is returned; the
+  * earlier version failed with a {@code NullPointerException} or
+  * {@code ClassCastException} in those cases.
+  *
+  * @param locationNameEntities the names to resolve
+  * @return for every resolved name, the list of values reported by the tool
+  * @throws ExecuteException if the tool exits with a non-zero status
+  * @throws IOException      if the tool cannot be run
+  */
+ public HashMap<String, ArrayList<String>> searchGeoNames(
+         ArrayList<String> locationNameEntities) throws ExecuteException,
+         IOException {
+     CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
+     ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+     cmdLine.addArgument("-s");
+     for (String name : locationNameEntities) {
+         cmdLine.addArgument(name);
+     }
+
+     LOG.fine("Executing: " + cmdLine);
+     DefaultExecutor exec = new DefaultExecutor();
+     exec.setExitValue(0);
+     // Kill the tool if it has not finished after 60 seconds.
+     ExecuteWatchdog watchdog = new ExecuteWatchdog(60000);
+     exec.setWatchdog(watchdog);
+     PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+     exec.setStreamHandler(streamHandler);
+     exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
+     String outputJson = outputStream.toString("UTF-8");
+
+     HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
+     Object parsed = JSONValue.parse(outputJson);
+     if (!(parsed instanceof JSONArray)) {
+         LOG.warning("lucene-geo-gazetteer did not return a JSON array; "
+                 + "no locations resolved");
+         return returnHash;
+     }
+
+     JSONArray json = (JSONArray) parsed;
+     for (int i = 0; i < json.size(); i++) {
+         JSONObject obj = (JSONObject) json.get(i);
+         for (Object key : obj.keySet()) {
+             String theKey = (String) key;
+             JSONArray vals = (JSONArray) obj.get(theKey);
+             ArrayList<String> stringVals = new ArrayList<String>(
+                     vals.size());
+             for (int j = 0; j < vals.size(); j++) {
+                 String val = (String) vals.get(j);
+                 stringVals.add(val);
+             }
+
+             returnHash.put(theKey, stringVals);
+         }
+     }
+
+     return returnHash;
+ }
+
+ /**
+ * Tells whether this parser can be used, initializing it from the current
+ * configuration first if that has not happened yet.
+ *
+ * @return the value of the {@code available} flag after initialization
+ */
+ public boolean isAvailable() {
+ if (!initialized) {
+ initialize(config.getNerModelUrl());
+ }
+ return this.available;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.File;
+import java.io.Serializable;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+/**
+ * Configuration of the geo topic parser: the location of the NER model.
+ */
+public class GeoParserConfig implements Serializable {
+    private static final long serialVersionUID = -3167692634278575818L;
+
+    /** Location of the NER model, or {@code null} when none is known. */
+    private URL nerModelUrl;
+
+    /**
+     * Creates a configuration pointing at the {@code en-ner-location.bin}
+     * resource next to this class, if that resource exists.
+     */
+    public GeoParserConfig() {
+        nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
+    }
+
+    /**
+     * Points the configuration at a model file on disk. The call is ignored
+     * when the path is {@code null}, does not exist or is a directory.
+     *
+     * @param path file system path of the model
+     */
+    public void setNERModelPath(String path) {
+        if (path != null) {
+            final File candidate = new File(path);
+            final boolean usable = !candidate.isDirectory()
+                    && candidate.exists();
+            if (usable) {
+                try {
+                    nerModelUrl = candidate.toURI().toURL();
+                } catch (MalformedURLException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+    }
+
+    /**
+     * Sets the model location directly.
+     *
+     * @param url the model location; stored without any check
+     */
+    public void setNerModelUrl(URL url) {
+        nerModelUrl = url;
+    }
+
+    /**
+     * Returns the model location.
+     *
+     * @return the configured URL, possibly {@code null}
+     */
+    public URL getNerModelUrl() {
+        return nerModelUrl;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class GeoTag {
+ String Geographic_NAME;
+ String Geographic_LONGTITUDE;
+ String Geographic_LATITUDE;
+ ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+
+ public void setMain(String name, String longitude, String latitude) {
+ Geographic_NAME = name;
+ Geographic_LONGTITUDE = longitude;
+ Geographic_LATITUDE = latitude;
+ }
+
+ public void addAlternative(GeoTag geotag) {
+ alternatives.add(geotag);
+ }
+
+ /*
+ * Store resolved geoName entities in a GeoTag
+ *
+ * @param resolvedGeonames resolved entities
+ *
+ * @param bestNER best name entity among all the extracted entities for the
+ * input stream
+ */
+ public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
+ String bestNER) {
+
+ for (String key : resolvedGeonames.keySet()) {
+ ArrayList<String> cur = resolvedGeonames.get(key);
+ if (key.equals(bestNER)) {
+ this.Geographic_NAME = cur.get(0);
+ this.Geographic_LONGTITUDE = cur.get(1);
+ this.Geographic_LATITUDE = cur.get(2);
+ } else {
+ GeoTag alter = new GeoTag();
+ alter.Geographic_NAME = cur.get(0);
+ alter.Geographic_LONGTITUDE = cur.get(1);
+ alter.Geographic_LATITUDE = cur.get(2);
+ this.addAlternative(alter);
+ }
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.commons.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Extracts location name entities from text with an OpenNLP name finder and
+ * ranks them by how often they occur.
+ * <p>
+ * Results accumulate in {@link #locationNameEntities} and
+ * {@link #bestNameEntity}. Only the calls into the name finder are
+ * synchronized; the result fields are updated without any locking.
+ */
+public class NameEntityExtractor {
+    /** Extracted names; after {@link #getBestNameEntity()} unique names, most frequent first. */
+    ArrayList<String> locationNameEntities;
+    /** Most frequent extracted name, or {@code null} if none has been chosen yet. */
+    String bestNameEntity;
+    /** Occurrence count per extracted name; kept across calls. */
+    private final HashMap<String, Integer> tf;
+    private final NameFinderME nameFinder;
+
+    /**
+     * Creates an extractor backed by the name finder model at the given URL.
+     *
+     * @param modelUrl location of the OpenNLP token name finder model
+     * @throws IOException if the model cannot be loaded
+     */
+    public NameEntityExtractor(URL modelUrl) throws IOException {
+        this.locationNameEntities = new ArrayList<String>();
+        this.bestNameEntity = null;
+        TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+        this.nameFinder = new NameFinderME(model);
+        this.tf = new HashMap<String, Integer>();
+    }
+
+    /**
+     * Uses OpenNLP to extract the location names that appear in the stream
+     * and appends them to {@link #locationNameEntities}. OpenNLP's default
+     * Name Finder accuracy is not very good, please refer to its
+     * documentation.
+     * <p>
+     * The text is read as UTF-8 and tokenized on single spaces. Each span
+     * reported by the name finder is further split on commas and trimmed;
+     * blank pieces are dropped, so an input without any recognized name adds
+     * nothing to the list.
+     *
+     * @param stream stream that passed from this.parse()
+     * @throws IOException if the stream cannot be read
+     */
+    public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
+        String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+        Span[] nameE;
+
+        //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+        synchronized (nameFinder) {
+            nameE = nameFinder.find(in);
+            //the same name finder is reused, so clear adaptive data
+            nameFinder.clearAdaptiveData();
+        }
+
+        for (String spanText : Span.spansToStrings(nameE, in)) {
+            // A token may carry punctuation ("Paris,"), so split on commas
+            // and keep only the non-blank parts.
+            for (String name : spanText.split(",")) {
+                name = name.trim();
+                if (!name.isEmpty()) {
+                    this.locationNameEntities.add(name);
+                }
+            }
+        }
+    }
+
+    /**
+     * Gets the best location entity extracted from the input stream. Simply
+     * picks the most frequent entity; if several entities share the highest
+     * frequency, one of them is picked randomly. May not be the optimal
+     * solution, but works.
+     * <p>
+     * As a side effect {@link #locationNameEntities} is rewritten to hold
+     * each distinct name once, in descending order of frequency. Does
+     * nothing when no entities have been collected.
+     */
+    public void getBestNameEntity() {
+        if (this.locationNameEntities.isEmpty()) {
+            return;
+        }
+
+        for (String name : this.locationNameEntities) {
+            Integer count = tf.get(name);
+            tf.put(name, count == null ? 1 : count + 1);
+        }
+
+        List<Map.Entry<String, Integer>> list =
+                new ArrayList<Map.Entry<String, Integer>>(tf.entrySet());
+        // Shuffling before the (stable) sort randomizes the order of ties.
+        Collections.shuffle(list);
+        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+            @Override
+            public int compare(Map.Entry<String, Integer> o1,
+                               Map.Entry<String, Integer> o2) {
+                // Descending Order
+                return o2.getValue().compareTo(o1.getValue());
+            }
+        });
+
+        // update so that they are in descending order
+        this.locationNameEntities.clear();
+        for (Map.Entry<String, Integer> entry : list) {
+            this.locationNameEntities.add(entry.getKey());
+        }
+        // The list is sorted by descending count, so its head is the winner.
+        this.bestNameEntity = list.get(0).getKey();
+    }
+}