You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/01 15:23:45 UTC

[tika] branch TIKA-3179 updated (ff7d43a -> 4abe3d1)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3179
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from ff7d43a  TIKA-3179 -- refactor parser modules so that there's tall, grande and venti, er, tika-parsers, tika-parsers-extended and tika-parsers-advanced
     add 04d520c  TIKA-3183 -- move all version info to properties in tika-parent
     new 3a532a2  Merge remote-tracking branch 'origin/main' into TIKA-3179
     new 0571868  Git add files...no idea how this mv failed...
     new 4abe3d1  Git add files...no idea how this mv failed...

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tika-eval/pom.xml                                  |   4 +-
 tika-example/pom.xml                               |   8 +-
 tika-langdetect/pom.xml                            |   2 +-
 tika-parent/pom.xml                                | 131 ++-
 tika-parsers-advanced/tika-dl/pom.xml              |   8 +-
 .../tika/dl/imagerec/dl4j-inception3-config.xml    |  27 +-
 .../apache/tika/dl/imagerec/dl4j-vgg16-config.xml} |  10 +-
 .../parser/recognition/AgeRecogniserConfig.java    |  69 ++
 .../tika-parser-advancedmedia-module/pom.xml       |   4 +-
 .../apache/tika/parser/captioning/tf/im2txtapi.py  | 266 +++++++
 .../tika/parser/recognition/tf/inceptionapi.py     | 483 +++++++++++
 ...w-video-rest.xml => tika-config-tflow-rest.xml} |   7 +-
 .../recognition/ObjectRecognitionParserTest.java   | 199 +++++
 ...Test.java => TensorflowVideoRecParserTest.java} |  15 +-
 .../tika-parser-nlp-module/pom.xml                 |   6 +-
 .../parser/ctakes/CTAKESAnnotationProperty.java    |  35 +-
 .../tika/parser/geo/NameEntityExtractor.java       | 122 +++
 .../tika/parser/journal/GrobidRESTParser.java      | 116 +++
 .../apache/tika/parser/journal/TEIDOMParser.java   | 882 +++++++++++++++++++++
 .../tika/parser/ner/nltk/NLTKNERecogniser.java     | 147 ++++
 .../services/org.apache.tika.parser.Parser         |   4 +-
 .../grobid/GrobidServer.properties}                |   1 +
 .../org/apache/tika/parser/journal/TEITest.java    |  69 ++
 .../tika/parser/ner/NamedEntityParserTest.java     |  91 +++
 .../NLTKNERecogniserTest.java}                     |  24 +-
 .../apache/tika/parser/ner/opennlp/get-models.sh   |  18 +-
 .../tika/parser/ner/{ => regex}/tika-config.xml    |   0
 .../tika-parser-scientific-module/pom.xml          |   4 +-
 tika-parsers/tika-parser-apple-module/pom.xml      |   2 +-
 tika-parsers/tika-parser-audiovideo-module/pom.xml |   4 -
 tika-parsers/tika-parser-code-module/pom.xml       |   2 +-
 tika-parsers/tika-parser-html-module/pom.xml       |   2 +-
 tika-parsers/tika-parser-image-module/pom.xml      |   6 -
 tika-parsers/tika-parser-microsoft-module/pom.xml  |   2 +-
 tika-parsers/tika-parser-news-module/pom.xml       |   4 +-
 tika-parsers/tika-parser-pkg-module/pom.xml        |   6 -
 tika-parsers/tika-parser-text-module/pom.xml       |   2 +-
 tika-parsers/tika-parser-xml-module/pom.xml        |   2 +-
 tika-server/pom.xml                                |   2 +-
 tika-translate/pom.xml                             |   4 +-
 tika-xmp/pom.xml                                   |   2 +-
 41 files changed, 2635 insertions(+), 157 deletions(-)
 copy tika-batch/src/test/resources/test-input/noisy_parsers/test0.xml => tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml (51%)
 copy tika-parsers-advanced/{tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-video-rest.xml => tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml} (79%)
 create mode 100644 tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
 create mode 100644 tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py
 create mode 100755 tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
 copy tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/{tika-config-tflow-video-rest.xml => tika-config-tflow-rest.xml} (89%)
 create mode 100644 tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
 copy tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/{TensorflowImageRecParserTest.java => TensorflowVideoRecParserTest.java} (78%)
 copy tika-parsers/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java => tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java (60%)
 create mode 100644 tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java
 create mode 100644 tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
 create mode 100644 tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
 create mode 100644 tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
 copy {tika-parsers-extended/tika-parser-sqlite3-module => tika-parsers-advanced/tika-parser-nlp-module}/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (85%)
 copy tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/{journal/GrobidExtractor.properties => ner/grobid/GrobidServer.properties} (95%)
 create mode 100644 tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java
 create mode 100644 tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
 copy tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/{regex/RegexNERecogniserTest.java => nltk/NLTKNERecogniserTest.java} (76%)
 copy tika-app/src/main/resources/log4j.properties => tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh (53%)
 mode change 100644 => 100755
 copy tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/{ => regex}/tika-config.xml (100%)


[tika] 01/03: Merge remote-tracking branch 'origin/main' into TIKA-3179

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3179
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3a532a2b40f0bac61102ef5a86acfc50b92d9c4a
Merge: ff7d43a 04d520c
Author: tallison <ta...@apache.org>
AuthorDate: Tue Sep 1 11:00:15 2020 -0400

    Merge remote-tracking branch 'origin/main' into TIKA-3179
    
    # Conflicts:
    #	tika-parser-modules/pom.xml
    #	tika-parser-modules/tika-parser-db-module/pom.xml

 tika-eval/pom.xml                                  |   4 +-
 tika-example/pom.xml                               |   8 +-
 tika-langdetect/pom.xml                            |   2 +-
 tika-parent/pom.xml                                | 131 ++++++++++++++-------
 tika-parsers-advanced/tika-dl/pom.xml              |   8 +-
 .../tika-parser-advancedmedia-module/pom.xml       |   4 +-
 .../tika-parser-nlp-module/pom.xml                 |   6 +-
 .../tika-parser-scientific-module/pom.xml          |   4 +-
 tika-parsers/tika-parser-apple-module/pom.xml      |   2 +-
 tika-parsers/tika-parser-audiovideo-module/pom.xml |   4 -
 tika-parsers/tika-parser-code-module/pom.xml       |   2 +-
 tika-parsers/tika-parser-html-module/pom.xml       |   2 +-
 tika-parsers/tika-parser-image-module/pom.xml      |   6 -
 tika-parsers/tika-parser-microsoft-module/pom.xml  |   2 +-
 tika-parsers/tika-parser-news-module/pom.xml       |   4 +-
 tika-parsers/tika-parser-pkg-module/pom.xml        |   6 -
 tika-parsers/tika-parser-text-module/pom.xml       |   2 +-
 tika-parsers/tika-parser-xml-module/pom.xml        |   2 +-
 tika-server/pom.xml                                |   2 +-
 tika-translate/pom.xml                             |   4 +-
 tika-xmp/pom.xml                                   |   2 +-
 21 files changed, 118 insertions(+), 89 deletions(-)



[tika] 02/03: Git add files...no idea how this mv failed...

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3179
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 057186813a46b2d2a90a16945c1f5604d350caf5
Author: tallison <ta...@apache.org>
AuthorDate: Tue Sep 1 11:22:42 2020 -0400

    Git add files...no idea how this mv failed...
---
 .../tika/dl/imagerec/dl4j-inception3-config.xml    |  35 +
 .../apache/tika/dl/imagerec/dl4j-vgg16-config.xml  |  32 +
 .../parser/recognition/AgeRecogniserConfig.java    |  69 ++
 .../apache/tika/parser/captioning/tf/im2txtapi.py  | 266 +++++++
 .../parser/recognition/tika-config-tflow-rest.xml  |  33 +
 .../recognition/ObjectRecognitionParserTest.java   | 199 +++++
 .../tf/TensorflowVideoRecParserTest.java           |  55 ++
 .../parser/ctakes/CTAKESAnnotationProperty.java    |  46 ++
 .../tika/parser/geo/NameEntityExtractor.java       | 122 +++
 .../tika/parser/journal/GrobidRESTParser.java      | 116 +++
 .../apache/tika/parser/journal/TEIDOMParser.java   | 882 +++++++++++++++++++++
 .../tika/parser/ner/nltk/NLTKNERecogniser.java     | 147 ++++
 .../services/org.apache.tika.parser.Parser         |  18 +
 .../tika/parser/ner/grobid/GrobidServer.properties |  17 +
 .../org/apache/tika/parser/journal/TEITest.java    |  69 ++
 .../tika/parser/ner/NamedEntityParserTest.java     |  91 +++
 .../tika/parser/ner/nltk/NLTKNERecogniserTest.java |  49 ++
 .../apache/tika/parser/ner/opennlp/get-models.sh   |  26 +
 .../apache/tika/parser/ner/regex/tika-config.xml   |  27 +
 19 files changed, 2299 insertions(+)

diff --git a/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml
new file mode 100644
index 0000000..2728063
--- /dev/null
+++ b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+      <mime>image/jpeg</mime>
+      <params>
+        <param name="modelWeightsPath" type="string">https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/inception_v3_keras_2.h5</param>
+        <param name="labelFile" type="string">https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/imagenet_class_index.json</param>
+        <param name="topN" type="int">10</param>
+        <param name="minConfidence" type="double">0.015</param>
+        <param name="class" type="string">org.apache.tika.dl.imagerec.DL4JInceptionV3Net</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
diff --git a/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
new file mode 100644
index 0000000..940a4b6
--- /dev/null
+++ b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+            <mime>image/jpeg</mime>
+            <params>
+                <param name="topN" type="int">3</param>
+                <param name="minConfidence" type="double">0.015</param>
+                <param name="class" type="string">org.apache.tika.dl.imagerec.DL4JVGG16Net</param>
+                <param name="modelType" type="string">VGG16</param>
+                <param name="serialize" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java b/tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
new file mode 100644
index 0000000..92427f4
--- /dev/null
+++ b/tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition;
+
+import java.net.URL;
+import java.util.Map;
+
+import org.apache.tika.config.Param;
+
+
+/**
+ * Stores URL for AgePredictor 
+ */
+public class AgeRecogniserConfig {
+
+	private String pathClassifyModel = null;
+	private String pathClassifyRegression = null;
+
+	public AgeRecogniserConfig(Map<String, Param> params) {
+
+		URL classifyUrl = AgeRecogniserConfig.class.getResource(
+				params.get("age.path.classify").getValue().toString());
+
+		if (classifyUrl != null) {
+			setPathClassifyModel(classifyUrl.getFile());
+		}
+
+		URL regressionUrl = AgeRecogniserConfig.class.getResource(
+				params.get("age.path.regression").getValue().toString());
+
+		if (regressionUrl != null) {
+			setPathClassifyRegression(regressionUrl.getFile());
+		}
+	}
+
+	public String getPathClassifyModel() {
+		return pathClassifyModel;
+	}
+
+	public void setPathClassifyModel(String pathClassifyModel) {
+		this.pathClassifyModel = pathClassifyModel;
+	}
+
+	public String getPathClassifyRegression() {
+		return pathClassifyRegression;
+	}
+
+	public void setPathClassifyRegression(String pathClassifyRegression) {
+		this.pathClassifyRegression = pathClassifyRegression;
+	}
+    
+    
+   
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py
new file mode 100644
index 0000000..97f1f2a
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+
+"""
+    This script exposes image captioning service over a REST API. Image captioning implementation based on the paper,
+
+        "Show and Tell: A Neural Image Caption Generator"
+        Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan
+
+    For more details, please visit :
+        http://arxiv.org/abs/1411.4555
+    Requirements :
+      Flask
+      tensorflow
+      numpy
+      requests
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import logging
+import math
+import requests
+import sys
+
+from flask import Flask, request, Response, jsonify
+from io import BytesIO
+from PIL import Image
+from time import time
+
+import tensorflow as tf
+import xml.etree.ElementTree as ET
+
+import model_wrapper
+import vocabulary
+import caption_generator
+
+# turning off the traceback by limiting its depth
+sys.tracebacklimit = 0
+
+# informative log messages for advanced users to troubleshoot errors when modifying model_info.xml
+try:
+    info = ET.parse('/usr/share/apache-tika/models/dl/image/caption/model_info.xml').getroot()
+except IOError:
+    logging.exception('model_info.xml is not found')
+    sys.exit(1)
+
+model_main = info.find('model_main')
+if model_main is None:
+    logging.exception('<model_main> tag in model_info.xml is not found')
+    sys.exit(1)
+
+checkpoint_path = model_main.find('checkpoint_path')
+if checkpoint_path is None:
+    logging.exception('<checkpoint_path> tag under <model_main> tag in model_info.xml is not found')
+    sys.exit(1)
+else:
+    checkpoint_path = checkpoint_path.text
+
+vocab_file = model_main.find('vocab_file')
+if vocab_file is None:
+    logging.exception('<vocab_file> tag under <model_main> tag in model_info.xml is not found')
+    sys.exit(1)
+else:
+    vocab_file = vocab_file.text
+
+port = info.get('port')
+if port is None:
+    logging.exception('port attribute in <service> tag in model_info.xml is not found')
+    sys.exit(1)
+
+# turning on the traceback by setting it to default
+sys.tracebacklimit = 1000
+
+FLAGS = tf.flags.FLAGS
+tf.flags.DEFINE_string("checkpoint_path", checkpoint_path, """Directory containing the model checkpoint file.""")
+tf.flags.DEFINE_string('vocab_file', vocab_file, """Text file containing the vocabulary.""")
+tf.flags.DEFINE_integer('port', port, """Server PORT, default:8764""")
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+class Initializer(Flask):
+    """
+        Class to initialize the REST API, this class loads the model from the given checkpoint path in model_info.xml
+        and prepares a caption_generator object
+    """
+
+    def __init__(self, name):
+        super(Initializer, self).__init__(name)
+        # build the inference graph
+        g = tf.Graph()
+        with g.as_default():
+            model = model_wrapper.ModelWrapper()
+            restore_fn = model.build_graph(FLAGS.checkpoint_path)
+        g.finalize()
+        # make the model globally available
+        self.model = model
+        # create the vocabulary
+        self.vocab = vocabulary.Vocabulary(FLAGS.vocab_file)
+        self.sess = tf.Session(graph=g)
+        # load the model from checkpoint
+        restore_fn(self.sess)
+
+
+def current_time():
+    """Returns current time in milliseconds"""
+
+    return int(1000 * time())
+
+
+app = Initializer(__name__)
+
+
+def get_remote_file(url, success=200, timeout=10):
+    """
+        Given HTTP URL, this api gets the content of it
+        returns (Content-Type, image_content)
+    """
+    try:
+        app.logger.info("GET: %s" % url)
+        auth = None
+        res = requests.get(url, stream=True, timeout=timeout, auth=auth)
+        if res.status_code == success:
+            return res.headers.get('Content-Type', 'application/octet-stream'), res.raw.data
+    except:
+        pass
+    return None, None
+
+
+@app.route("/")
+def index():
+    """The index page which provide information about other API end points"""
+
+    return """
+    <div>
+    <h1> Image Captioning REST API </h1>
+    <h3> The following API end points are valid </h3>
+        <ul>
+            <h4> Inception V3 </h4>
+            <li> <code>/inception/v3/ping </code> - <br/>
+                <b> Description : </b> checks availability of the service. returns "pong" with status 200 when it is available
+            </li>
+            <li> <code>/inception/v3/caption/image</code> - <br/>
+                <table>
+                <tr><th align="left"> Description </th><td> This is a service that can caption images</td></tr>
+                <tr><th align="left"> How to supply Image Content </th></tr>
+                <tr><th align="left"> With HTTP GET : </th> <td>
+                    Include a query parameter <code>url </code> which is an http url of JPEG image <br/>
+                    Example: <code> curl "localhost:8764/inception/v3/caption/image?url=http://xyz.com/example.jpg"</code>
+                </td></tr>
+                <tr><th align="left"> With HTTP POST :</th><td>
+                    POST JPEG image content as binary data in request body. <br/>
+                    Example: <code> curl -X POST "localhost:8764/inception/v3/caption/image" --data-binary @example.jpg </code>
+                </td></tr>
+                </table>
+            </li>
+        </ul>
+    </div>
+    """
+
+
+@app.route("/inception/v3/ping", methods=["GET"])
+def ping_pong():
+    """API to do health check. If this says status code 200, then healthy"""
+
+    return "pong"
+
+
+@app.route("/inception/v3/caption/image", methods=["GET", "POST"])
+def caption_image():
+    """API to caption images"""
+    image_format = "not jpeg"
+
+    st = current_time()
+    # get beam_size
+    beam_size = int(request.args.get("beam_size", "3"))
+    # get max_caption_length
+    max_caption_length = int(request.args.get("max_caption_length", "20"))
+    # get image_data
+    if request.method == 'POST':
+        image_data = request.get_data()
+    else:
+        url = request.args.get("url")
+        c_type, image_data = get_remote_file(url)
+        if not image_data:
+            return Response(status=400, response=jsonify(error="Could not HTTP GET %s" % url))
+        if 'image/jpeg' in c_type:
+            image_format = "jpeg"
+
+    # use c_type to find whether image_format is jpeg or not
+    # if jpeg, don't convert
+    if image_format == "jpeg":
+        jpg_image = image_data
+    # if not jpeg
+    else:
+        # open the image from raw bytes
+        image = Image.open(BytesIO(image_data))
+        # convert the image to RGB format, otherwise will give errors when converting to jpeg, if the image isn't RGB
+        rgb_image = image.convert("RGB")
+        # convert the RGB image to jpeg
+        image_bytes = BytesIO()
+        rgb_image.save(image_bytes, format="jpeg", quality=95)
+        jpg_image = image_bytes.getvalue()
+        image_bytes.close()
+
+    read_time = current_time() - st
+    # restart counter
+    st = current_time()
+
+    generator = caption_generator.CaptionGenerator(app.model,
+                                                   app.vocab,
+                                                   beam_size=beam_size,
+                                                   max_caption_length=max_caption_length)
+    captions = generator.beam_search(app.sess, jpg_image)
+
+    captioning_time = current_time() - st
+    app.logger.info("Captioning time : %d" % captioning_time)
+
+    array_captions = []
+    for caption in captions:
+        sentence = [app.vocab.id_to_word(w) for w in caption.sentence[1:-1]]
+        sentence = " ".join(sentence)
+        array_captions.append({
+            'sentence': sentence,
+            'confidence': math.exp(caption.logprob)
+        })
+
+    response = {
+        'beam_size': beam_size,
+        'max_caption_length': max_caption_length,
+        'captions': array_captions,
+        'time': {
+            'read': read_time,
+            'captioning': captioning_time,
+            'units': 'ms'
+        }
+    }
+    return Response(response=json.dumps(response), status=200, mimetype="application/json")
+
+
+def main(_):
+    if not app.debug:
+        print("Serving on port %d" % FLAGS.port)
+    app.run(host="0.0.0.0", port=FLAGS.port)
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
new file mode 100644
index 0000000..69a65d0
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+            <mime>image/jpeg</mime>
+            <mime>image/png</mime>
+            <mime>image/gif</mime>
+            <params>
+                <param name="apiBaseUri" type="uri">http://localhost:8764/inception/v4</param>
+                <param name="topN" type="int">2</param>
+                <param name="minConfidence" type="double">0.015</param>
+                <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
new file mode 100644
index 0000000..25520af
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.List;
+
+/**
+ * Testcases for Object Recognition Parser
+ */
+public class ObjectRecognitionParserTest {
+
+    // Config files
+    private static final String CONFIG_FILE_OBJ_REC = "org/apache/tika/parser/recognition/tika-config-tflow.xml";
+    private static final String CONFIG_REST_FILE_OBJ_REC = "org/apache/tika/parser/recognition/tika-config-tflow-rest.xml";
+    private static final String CONFIG_REST_FILE_IM2TXT = "org/apache/tika/parser/recognition/tika-config-tflow-im2txt-rest.xml";
+
+    // Test images
+    private static final String CAT_IMAGE_JPEG = "test-documents/testJPEG.jpg";
+    private static final String CAT_IMAGE_PNG = "test-documents/testPNG.png";
+    private static final String CAT_IMAGE_GIF = "test-documents/testGIF.gif";
+
+    private static final String BASEBALL_IMAGE_JPEG = "test-documents/baseball.jpg";
+    private static final String BASEBALL_IMAGE_PNG = "test-documents/baseball.png";
+    private static final String BASEBALL_IMAGE_GIF = "test-documents/baseball.gif";
+
+    private static final ClassLoader loader = ObjectRecognitionParserTest.class.getClassLoader();
+
+    private static final Logger LOG = LoggerFactory.getLogger(ObjectRecognitionParserTest.class);
+    
+    @Test
+    public void jpegTFObjRecTest() throws IOException, TikaException, SAXException {
+      TensorflowImageRecParser p = new TensorflowImageRecParser();
+      Assume.assumeTrue(p.isAvailable());      
+        try (InputStream stream = loader.getResourceAsStream(CONFIG_FILE_OBJ_REC)) {
+            assert stream != null;
+            Tika tika = new Tika(new TikaConfig(stream));
+            Metadata metadata = new Metadata();
+            try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE_JPEG)) {
+                Reader reader = tika.parse(imageStream, metadata);
+                List<String> lines = IOUtils.readLines(reader);
+                String text = StringUtils.join(lines, " ");
+                String[] expectedObjects = {"Egyptian cat", "tabby, tabby cat"};
+                String metaValues = StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY_OBJ_REC), " ");
+                for (String expectedObject : expectedObjects) {
+                    String message = "'" + expectedObject + "' must have been detected";
+                    Assert.assertTrue(message, text.contains(expectedObject));
+                    Assert.assertTrue(message, metaValues.contains(expectedObject));
+                }
+            }
+        }
+    }
+
+    @Test
+    public void jpegRESTObjRecTest() throws Exception {
+        String apiUrl = "http://localhost:8764/inception/v4/ping";
+        boolean available = false;
+        int status = 500;
+        try{
+          status = WebClient.create(apiUrl).get().getStatus();
+          available = status == 200;
+        }
+        catch(Exception ignore){}
+        Assume.assumeTrue(available);
+        String[] expectedObjects = {"Egyptian cat", "tabby, tabby cat"};
+        doRecognize(CONFIG_REST_FILE_OBJ_REC, CAT_IMAGE_JPEG,
+                ObjectRecognitionParser.MD_KEY_OBJ_REC, expectedObjects);
+    }
+
+    @Test
+    public void pngRESTObjRecTest() throws Exception {
+        String apiUrl = "http://localhost:8764/inception/v4/ping";
+        boolean available = false;
+        int status = 500;
+        try{
+            status = WebClient.create(apiUrl).get().getStatus();
+            available = status == 200;
+        }
+        catch(Exception ignore){}
+        Assume.assumeTrue(available);
+        String[] expectedObjects = {"Egyptian cat", "tabby, tabby cat"};
+        doRecognize(CONFIG_REST_FILE_OBJ_REC, CAT_IMAGE_PNG,
+                ObjectRecognitionParser.MD_KEY_OBJ_REC, expectedObjects);
+    }
+
+    @Test
+    public void gifRESTObjRecTest() throws Exception {
+        String apiUrl = "http://localhost:8764/inception/v4/ping";
+        boolean available = false;
+        int status = 500;
+        try{
+            status = WebClient.create(apiUrl).get().getStatus();
+            available = status == 200;
+        }
+        catch(Exception ignore){}
+        Assume.assumeTrue(available);
+        String[] expectedObjects = {"Egyptian cat"};
+        doRecognize(CONFIG_REST_FILE_OBJ_REC, CAT_IMAGE_GIF,
+                ObjectRecognitionParser.MD_KEY_OBJ_REC, expectedObjects);
+    }
+
+    @Test
+    public void jpegRESTim2txtTest() throws Exception {
+        String apiUrl = "http://localhost:8764/inception/v3/ping";
+        boolean available = false;
+        int status = 500;
+        try{
+          status = WebClient.create(apiUrl).get().getStatus();
+          available = status == 200;
+        }
+        catch(Exception ignore){}
+        Assume.assumeTrue(available);   
+        String[] expectedCaption = {"a baseball player holding a bat on a field"};
+        doRecognize(CONFIG_REST_FILE_IM2TXT, BASEBALL_IMAGE_JPEG,
+                ObjectRecognitionParser.MD_KEY_IMG_CAP, expectedCaption);
+    }
+
+    @Test
+    public void pngRESTim2txtTest() throws Exception {
+        String apiUrl = "http://localhost:8764/inception/v3/ping";
+        boolean available = false;
+        int status = 500;
+        try{
+          status = WebClient.create(apiUrl).get().getStatus();
+          available = status == 200;
+        }
+        catch(Exception ignore){}
+        Assume.assumeTrue(available);  
+        String[] expectedCaption = {"a baseball player holding a bat on a field"};
+        doRecognize(CONFIG_REST_FILE_IM2TXT, BASEBALL_IMAGE_PNG,
+                ObjectRecognitionParser.MD_KEY_IMG_CAP, expectedCaption);
+    }
+
+    @Test
+    public void gifRESTim2txtTest() throws Exception {
+        String apiUrl = "http://localhost:8764/inception/v3/ping";
+        boolean available = false;
+        int status = 500;
+        try{
+          status = WebClient.create(apiUrl).get().getStatus();
+          available = status == 200;
+        }
+        catch(Exception ignore){}
+        Assume.assumeTrue(available);  
+        String[] expectedCaption = {"a baseball player pitching a ball on top of a field"};
+        doRecognize(CONFIG_REST_FILE_IM2TXT, BASEBALL_IMAGE_GIF,
+                ObjectRecognitionParser.MD_KEY_IMG_CAP, expectedCaption);
+    }
+
+    private void doRecognize(String configFile, String testImg, String mdKey, String[] expectedObjects) throws Exception {
+        try (InputStream stream = loader.getResourceAsStream(configFile)) {
+            assert stream != null;
+            Tika tika = new Tika(new TikaConfig(stream));
+            Metadata metadata = new Metadata();
+            try (InputStream imageStream = loader.getResourceAsStream(testImg)) {
+                Reader reader = tika.parse(imageStream, metadata);
+                String text = IOUtils.toString(reader);
+                String metaValues = StringUtils.join(metadata.getValues(mdKey), " ");
+                LOG.info("MetaValues = {}", metaValues);
+                for (String expectedObject : expectedObjects) {
+                    String message = "'" + expectedObject + "' must have been detected";
+                    Assert.assertTrue(message, text.contains(expectedObject));
+                    Assert.assertTrue(message, metaValues.contains(expectedObject));
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowVideoRecParserTest.java b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowVideoRecParserTest.java
new file mode 100644
index 0000000..ded686a
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowVideoRecParserTest.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition.tf;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.RecognisedObject;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+@Ignore
+public class TensorflowVideoRecParserTest {
+
+    @Test
+    public void recognise() throws Exception {
+        TensorflowRESTVideoRecogniser recogniser = new TensorflowRESTVideoRecogniser();
+        recogniser.initialize(new HashMap<String, Param>());
+        try (InputStream stream = getClass().getClassLoader().getResourceAsStream("test-documents/testVideoMp4.mp4")) {
+            List<RecognisedObject> objects = recogniser.recognise(stream, new DefaultHandler(), new Metadata(), new ParseContext());
+            
+            Assert.assertTrue(objects.size() > 0);
+            Set<String> objectLabels = new HashSet<>();
+            for (RecognisedObject object : objects) {
+                objectLabels.add(object.getLabel());
+            }
+            Assert.assertTrue(objectLabels.size() > 0);
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
new file mode 100644
index 0000000..1c1be02
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+
/**
 * Enumerates the properties an {@code IdentifiedAnnotation} object can
 * provide, together with the metadata key each property is published under.
 */
public enum CTAKESAnnotationProperty {
    BEGIN("start"),
    END("end"),
    CONDITIONAL("conditional"),
    CONFIDENCE("confidence"),
    DISCOVERY_TECNIQUE("discoveryTechnique"),
    GENERIC("generic"),
    HISTORY_OF("historyOf"),
    ID("id"),
    ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
    POLARITY("polarity");

    // metadata key associated with this annotation property
    private final String propertyName;

    CTAKESAnnotationProperty(String propertyName) {
        this.propertyName = propertyName;
    }

    /** Returns the metadata key for this property. */
    public String getName() {
        return propertyName;
    }
}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java
new file mode 100644
index 0000000..c998e40
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.util.Span;
+
+public class NameEntityExtractor {
+    ArrayList<String> locationNameEntities;
+    String bestNameEntity;
+    private HashMap<String, Integer> tf;
+    private final NameFinderME nameFinder;
+
+    public NameEntityExtractor(NameFinderME nameFinder) throws IOException {
+        this.locationNameEntities = new ArrayList<String>();
+        this.bestNameEntity = null;
+        this.nameFinder = nameFinder;
+        this.tf = new HashMap<String, Integer>();
+    }
+
+    /*
+     * Use OpenNLP to extract location names that's appearing in the steam.
+     * OpenNLP's default Name Finder accuracy is not very good, please refer to
+     * its documentation.
+     * 
+     * @param stream stream that passed from this.parse()
+     */
+    public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
+        String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+        Span nameE[];
+        
+        //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+        synchronized (nameFinder) {
+            nameE = nameFinder.find(in);
+            //the same name finder is reused, so clear adaptive data
+            nameFinder.clearAdaptiveData();
+        }
+
+        String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
+        spanNames = spanNames.substring(1, spanNames.length() - 1);
+        String[] tmp = spanNames.split(",");
+
+        for (String name : tmp) {
+            name = name.trim();
+            this.locationNameEntities.add(name);
+        }
+
+
+    }
+
+    /*
+     * Get the best location entity extracted from the input stream. Simply
+     * return the most frequent entity, If there several highest frequent
+     * entity, pick one randomly. May not be the optimal solution, but works.
+     * 
+     * @param locationNameEntities OpenNLP name finder's results, stored in
+     * ArrayList
+     */
+    public void getBestNameEntity() {
+        if (this.locationNameEntities.size() == 0)
+            return;
+
+        for (int i = 0; i < this.locationNameEntities.size(); ++i) {
+            if (tf.containsKey(this.locationNameEntities.get(i)))
+                tf.put(this.locationNameEntities.get(i),
+                        tf.get(this.locationNameEntities.get(i)) + 1);
+            else
+                tf.put(this.locationNameEntities.get(i), 1);
+        }
+        int max = 0;
+        List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
+                tf.entrySet());
+        Collections.shuffle(list);
+        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+            public int compare(Map.Entry<String, Integer> o1,
+                    Map.Entry<String, Integer> o2) {
+                // Descending Order
+                return o2.getValue().compareTo(o1.getValue());
+            }
+        });
+
+        this.locationNameEntities.clear();// update so that they are in
+                                          // descending order
+        for (Map.Entry<String, Integer> entry : list) {
+            this.locationNameEntities.add(entry.getKey());
+            if (entry.getValue() > max) {
+                max = entry.getValue();
+                this.bestNameEntity = entry.getKey();
+            }
+        }
+    }
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
new file mode 100644
index 0000000..110c504
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+
+public class GrobidRESTParser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(GrobidRESTParser.class);
+
+
+    private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+    private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
+    // doesn't work
+    // nfc why
+
+    private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+
+    private String restHostUrlStr;
+
+    public GrobidRESTParser() {
+        String restHostUrlStr = null;
+        try {
+            restHostUrlStr = readRestUrl();
+        } catch (IOException e) {
+            LOG.warn("can't read rest url", e);
+        }
+
+        if (restHostUrlStr == null
+                || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+            this.restHostUrlStr = GROBID_REST_HOST;
+        } else {
+            this.restHostUrlStr = restHostUrlStr;
+        }
+    }
+
+    public void parse(String filePath, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws FileNotFoundException {
+
+        File pdfFile = new File(filePath);
+        ContentDisposition cd = new ContentDisposition(
+                "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+        Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
+        MultipartBody body = new MultipartBody(att);
+
+        Response response = WebClient
+                .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+                .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+                .post(body);
+
+        try {
+            String resp = response.readEntity(String.class);
+            Metadata teiMet = new TEIDOMParser().parse(resp, context);
+            for (String key : teiMet.names()) {
+                metadata.add("grobid:header_" + key, teiMet.get(key));
+            }
+        } catch (Exception e) {
+            LOG.warn("Couldn't read response", e);
+        }
+    }
+
+    private static String readRestUrl() throws IOException {
+        Properties grobidProperties = new Properties();
+        grobidProperties.load(GrobidRESTParser.class
+                .getResourceAsStream("GrobidExtractor.properties"));
+
+        return grobidProperties.getProperty("grobid.server.url");
+    }
+
+    protected static boolean canRun() {
+        Response response = null;
+
+        try {
+            response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH)
+                    .accept(MediaType.TEXT_HTML).get();
+            String resp = response.readEntity(String.class);
+            return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+        } catch (Exception e) {
+            //swallow...can't run
+            return false;
+        }
+    }
+
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
new file mode 100644
index 0000000..b79ec93
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
@@ -0,0 +1,882 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+public class TEIDOMParser {
+
    /** Creates a parser for TEI XML documents as returned by Grobid. */
    public TEIDOMParser() {
    }
+
+    public Metadata parse(String source, ParseContext parseContext) throws TikaException, SAXException, IOException {
+
+        Document root = XMLReaderUtils.buildDOM(
+                new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8)), parseContext);
+
+        Metadata metadata = new Metadata();
+        createGrobidMetadata(source, root.getDocumentElement(), metadata);
+        return metadata;
+    }
+
+    private void createGrobidMetadata(String source, Element root,
+                                      Metadata metadata) {
+        if (root != null) {
+
+            Node text = getFirstChild(root.getChildNodes(), "text");
+            if (text != null) {
+                parseText(text, metadata);
+            }
+            Node teiHeader = getFirstChild(root.getChildNodes(), "teiHeader");
+            Node fileDesc = getFirstChild(teiHeader.getChildNodes(), "fileDesc");
+            if (fileDesc != null) {
+                parseFileDesc(fileDesc, metadata);
+
+            }
+            Node profileDesc = getFirstChild(teiHeader.getChildNodes(), "profileDesc");
+            if (profileDesc != null) {
+                parseProfileDesc(profileDesc, metadata);
+            }
+
+        }
+
+        addStaticMet(source, root, metadata);
+    }
+
+    private void addStaticMet(String source, Element obj, Metadata metadata) {
+        metadata.add("Class", Metadata.class.getName());
+        //no longer available after we got rid of json.org's and its .toJSONObject()
+//        metadata.add("TEIJSONSource", obj.toString());
+        metadata.add("TEIXMLSource", source);
+    }
+
+    private void parseText(Node text, Metadata metadata) {
+        String lang = getFirstAttribute(text, "xml", "lang");
+        if (lang != null) {
+            metadata.add("Language", lang);
+        }
+    }
+
+    private void parseFileDesc(Node fileDesc, Metadata metadata) {
+        Node titleStmt = getFirstChild(fileDesc.getChildNodes(), "titleStmt");
+
+        if (titleStmt != null) {
+            parseTitleStmt(titleStmt, metadata);
+        }
+
+        Node sourceDesc = getFirstChild(fileDesc.getChildNodes(), "sourceDesc");
+        if (sourceDesc != null) {
+            parseSourceDesc(sourceDesc, metadata);
+        }
+    }
+
+    private void parseTitleStmt(Node titleStmt, Metadata metadata) {
+        Node title = getFirstChild(titleStmt.getChildNodes(), "title");
+        if (title != null) {
+            String titleText = title.getTextContent();
+            if (titleText != null) {
+                metadata.add("Title", titleText);
+            }
+        }
+    }
+
+    private void parseSourceDesc(Node sourceDesc, Metadata metadata) {
+        Node biblStruct = getFirstChild(sourceDesc.getChildNodes(), "biblStruct");
+        if (biblStruct != null) {
+            parseBiblStruct(biblStruct, metadata);
+        }
+    }
+
+    private void parseBiblStruct(Node biblStruct, Metadata metadata) {
+
+        Node analytic = getFirstChild(biblStruct.getChildNodes(), "analytic");
+        if (analytic != null) {
+            List<Node> authorNodes = getChildNodes(analytic.getChildNodes(), "author");
+            List<Author> authorList = new ArrayList<>();
+            for (Node authorNode : authorNodes) {
+                parseAuthor(authorNode, authorList);
+            }
+
+            metadata.add("Address", getMetadataAddresses(authorList));
+            metadata.add("Affiliation", getMetadataAffiliations(authorList));
+            metadata.add("Authors", getMetadataAuthors(authorList));
+            metadata.add("FullAffiliations",
+                    getMetadataFullAffiliations(authorList));
+
+
+        } else {
+            metadata.add("Error", "Unable to parse: no analytic section in JSON");
+        }
+
+    }
+
+    private String getMetadataFullAffiliations(List<Author> authorList) {
+        List<Affiliation> unique = new ArrayList<Affiliation>();
+        StringBuilder metAffils = new StringBuilder();
+
+        for (Author a : authorList) {
+            for (Affiliation af : a.getAffiliations()) {
+                if (!unique.contains(af)) {
+                    unique.add(af);
+                }
+            }
+        }
+        metAffils.append("[");
+        for (Affiliation af : unique) {
+            metAffils.append(af.toString());
+            metAffils.append(",");
+        }
+        metAffils.append(metAffils.deleteCharAt(metAffils.length() - 1));
+        metAffils.append("]");
+        return metAffils.toString();
+    }
+
    /**
     * Builds the "Authors" metadata string: each author's name parts followed
     * by the 1-based indices (comma-separated) of their affiliations within
     * the global de-duplicated affiliation list.
     * <p>
     * NOTE(review): exact spacing between name parts depends on printOrBlank
     * (defined elsewhere in this file) -- presumably it appends a trailing
     * space or nothing; confirm before restyling.
     */
    private String getMetadataAuthors(List<Author> authorList) {
        // generates Chris A. Mattmann 1, 2 Daniel J. Crichton 1 Nenad Medvidovic 2
        // Steve Hughes 1
        List<Affiliation> unique = new ArrayList<Affiliation>();
        StringBuilder metAuthors = new StringBuilder();

        // de-duplicate affiliations across all authors; list order defines the
        // 1-based index printed after each author's name
        for (Author a : authorList) {
            for (Affiliation af : a.getAffiliations()) {
                if (!unique.contains(af)) {
                    unique.add(af);
                }
            }
        }

        for (Author a : authorList) {
            metAuthors.append(printOrBlank(a.getFirstName()));
            metAuthors.append(printOrBlank(a.getMiddleName()));
            metAuthors.append(printOrBlank(a.getSurName()));

            // collect this author's affiliation indices, e.g. "1,2"
            StringBuilder affilBuilder = new StringBuilder();
            for (int idx = 0; idx < unique.size(); idx++) {
                Affiliation af = unique.get(idx);
                if (a.getAffiliations().contains(af)) {
                    affilBuilder.append((idx + 1));
                    affilBuilder.append(",");
                }
            }

            // strip the trailing comma from the index list
            if (affilBuilder.length() > 0)
                affilBuilder.deleteCharAt(affilBuilder.length() - 1);

            metAuthors.append(affilBuilder.toString());
            metAuthors.append(" ");
        }

        return metAuthors.toString();
    }
+
    /**
     * Builds the "Affiliation" metadata string: each distinct affiliation's
     * organization name prefixed by its 1-based index and separated by "; ".
     */
    private String getMetadataAffiliations(List<Author> authorList) {
        // generates 1 Jet Propulsion Laboratory California Institute of Technology
        // ; 2 Computer Science Department University of Southern California
        List<Affiliation> unique = new ArrayList<Affiliation>();
        StringBuilder metAffil = new StringBuilder();

        // de-duplicate affiliations across all authors
        for (Author a : authorList) {
            for (Affiliation af : a.getAffiliations()) {
                if (!unique.contains(af)) {
                    unique.add(af);
                }
            }
        }

        int count = 1;
        for (Affiliation a : unique) {
            metAffil.append(count);
            metAffil.append(" ");
            metAffil.append(a.getOrgName().toString());
            // NOTE(review): deletes the last char of the org-name text --
            // presumably trimming a trailing separator produced by
            // getOrgName()'s toString(); confirm against the OrgName type
            metAffil.deleteCharAt(metAffil.length() - 1);
            metAffil.append("; ");
            count++;
        }

        // remove the final "; " separator when at least one entry was written
        if (count > 1) {
            metAffil.deleteCharAt(metAffil.length() - 1);
            metAffil.deleteCharAt(metAffil.length() - 1);
        }

        return metAffil.toString();
    }
+
+    /**
+     * Builds the address metadata string, e.g.
+     * "Pasadena, CA 91109, USA Los Angeles, CA 90089, USA ".
+     * Addresses are de-duplicated in first-seen order and separated by a
+     * single space; note the result keeps a trailing space.
+     */
+    private String getMetadataAddresses(List<Author> authorList) {
+        List<Address> distinct = new ArrayList<Address>();
+        for (Author author : authorList) {
+            for (Affiliation affiliation : author.getAffiliations()) {
+                Address candidate = affiliation.getAddress();
+                if (!distinct.contains(candidate)) {
+                    distinct.add(candidate);
+                }
+            }
+        }
+
+        StringBuilder out = new StringBuilder();
+        for (Address address : distinct) {
+            out.append(address.toString());
+            out.append(" ");
+        }
+
+        return out.toString();
+    }
+
+    /**
+     * Parses one TEI &lt;author&gt; element into an {@link Author} and
+     * appends it to the given list. Name parts come from &lt;persName&gt;
+     * (forenames typed "first"/"middle", plus the surname); affiliations
+     * come from any &lt;affiliation&gt; children.
+     */
+    private void parseAuthor(Node authorNode, List<Author> authorList) {
+        Author author = new Author();
+
+        Node persName = getFirstChild(authorNode.getChildNodes(), "persName");
+        if (persName != null) {
+            for (Node forename : getChildNodes(persName.getChildNodes(), "forename")) {
+                parseNamePart(forename, author);
+            }
+            Node surname = getFirstChild(persName.getChildNodes(), "surname");
+            if (surname != null && surname.getTextContent() != null) {
+                author.setSurName(surname.getTextContent());
+            }
+        }
+
+        for (Node affiliation : getChildNodes(authorNode.getChildNodes(), "affiliation")) {
+            parseOneAffiliation(affiliation, author);
+        }
+
+        authorList.add(author);
+    }
+
+    /**
+     * Records one &lt;forename&gt; element on the author: type="first" sets
+     * the first name, type="middle" sets the middle name; any other type is
+     * ignored.
+     */
+    private void parseNamePart(Node namePart, Author author) {
+        String type = getFirstAttribute(namePart, null, "type");
+        String content = namePart.getTextContent();
+        if (type == null || content == null) {
+            return;
+        }
+        if ("first".equals(type)) {
+            author.setFirstName(content);
+        } else if ("middle".equals(type)) {
+            author.setMiddleName(content);
+        }
+    }
+
+    /**
+     * Parses one TEI &lt;affiliation&gt; element (its optional address and
+     * all &lt;orgName&gt; children) and appends the result to the author's
+     * affiliation list.
+     */
+    private void parseOneAffiliation(Node affiliationNode, Author author) {
+        Affiliation affiliation = new Affiliation();
+
+        Node addressNode = getFirstChild(affiliationNode.getChildNodes(), "address");
+        if (addressNode != null) {
+            parseAddress(addressNode, affiliation);
+        }
+
+        // All <orgName> children accumulate into a single OrgName holder.
+        OrgName orgName = new OrgName();
+        for (Node orgNameNode : getChildNodes(affiliationNode.getChildNodes(), "orgName")) {
+            parseOrgName(orgNameNode, orgName);
+        }
+        affiliation.setOrgName(orgName);
+
+        author.getAffiliations().add(affiliation);
+    }
+
+    /**
+     * Populates the affiliation's {@link Address} from a TEI &lt;address&gt;
+     * element: region, postCode, settlement, and country (element text plus
+     * its optional "key" attribute).
+     */
+    private void parseAddress(Node addressNode, Affiliation affiliation) {
+        Address address = new Address();
+        Node region = getFirstChild(addressNode.getChildNodes(), "region");
+        if (region != null && region.getTextContent() != null) {
+            address.setRegion(region.getTextContent());
+        }
+        Node postCode = getFirstChild(addressNode.getChildNodes(), "postCode");
+        if (postCode != null && postCode.getTextContent() != null) {
+            address.setPostCode(postCode.getTextContent());
+        }
+        Node settlementNode = getFirstChild(addressNode.getChildNodes(), "settlement");
+        if (settlementNode != null && settlementNode.getTextContent() != null) {
+            address.setSettlment(settlementNode.getTextContent());
+        }
+
+        Node countryNode = getFirstChild(addressNode.getChildNodes(), "country");
+        if (countryNode != null) {
+            Country country = new Country();
+            // "key" presumably carries a country code per TEI convention --
+            // TODO confirm; the code only copies it verbatim.
+            String key = getFirstAttribute(countryNode, null, "key");
+            if (key != null) {
+                country.setKey(key);
+            }
+            String content = countryNode.getTextContent();
+            if (content != null) {
+                country.setContent(content);
+            }
+            address.setCountry(country);
+        }
+
+        affiliation.setAddress(address);
+    }
+
+    /**
+     * Converts one &lt;orgName&gt; element into an {@link OrgTypeName}
+     * (element text as the name, "type" attribute as the type) and appends
+     * it to the holder's list.
+     */
+    private void parseOrgName(Node orgNode, OrgName orgName) {
+        OrgTypeName typeName = new OrgTypeName();
+
+        String content = orgNode.getTextContent();
+        if (content != null) {
+            typeName.setName(content);
+        }
+        String type = getFirstAttribute(orgNode, null, "type");
+        if (type != null) {
+            typeName.setType(type);
+        }
+
+        orgName.getTypeNames().add(typeName);
+    }
+
+    /**
+     * Extracts the abstract (first &lt;p&gt; under &lt;abstract&gt;) and the
+     * keywords (each &lt;term&gt; under &lt;keywords&gt;, or the raw keywords
+     * text when no &lt;term&gt; children exist) into the metadata under the
+     * "Abstract" and "Keyword" keys.
+     */
+    private void parseProfileDesc(Node profileDesc, Metadata metadata) {
+        Node abstractNode = getFirstChild(profileDesc.getChildNodes(), "abstract");
+        if (abstractNode != null) {
+            Node pNode = getFirstChild(abstractNode.getChildNodes(), "p");
+            if (pNode != null) {
+                metadata.add("Abstract", pNode.getTextContent());
+            }
+        }
+
+        Node textClassNode = getFirstChild(profileDesc.getChildNodes(), "textClass");
+        if (textClassNode != null) {
+            Node keywordsNode = getFirstChild(textClassNode.getChildNodes(), "keywords");
+            if (keywordsNode != null) {
+                List<Node> terms = getChildNodes(keywordsNode.getChildNodes(), "term");
+                if (terms.size() == 0) {
+                    // Some documents (e.g. test AJ15.pdf) put the keyword
+                    // text directly in <keywords> with no <term> children.
+                    metadata.add("Keyword", keywordsNode.getTextContent());
+                } else {
+                    for (Node term : terms) {
+                        metadata.add("Keyword", term.getTextContent());
+                    }
+                }
+
+            }
+        }
+
+    }
+
+    /**
+     * Returns the value followed by a single trailing space, or a lone space
+     * when the value is null or empty (keeps the name-part spacing uniform
+     * when authors lack a middle name etc.).
+     */
+    private String printOrBlank(String val) {
+        return (val == null || val.isEmpty()) ? " " : val + " ";
+    }
+
+    /**
+     * Mutable holder for one parsed TEI author: name parts plus the list of
+     * affiliations attached to that author.
+     */
+    class Author {
+
+        // All name parts default to null until set by parseAuthor().
+        private String surName;
+
+        private String middleName;
+
+        private String firstName;
+
+        private List<Affiliation> affiliations;
+
+        public Author() {
+            this.surName = null;
+            this.middleName = null;
+            this.firstName = null;
+            this.affiliations = new ArrayList<Affiliation>();
+        }
+
+        /**
+         * @return the surName
+         */
+        public String getSurName() {
+            return surName;
+        }
+
+        /**
+         * @param surName the surName to set
+         */
+        public void setSurName(String surName) {
+            this.surName = surName;
+        }
+
+        /**
+         * @return the middleName
+         */
+        public String getMiddleName() {
+            return middleName;
+        }
+
+        /**
+         * @param middleName the middleName to set
+         */
+        public void setMiddleName(String middleName) {
+            this.middleName = middleName;
+        }
+
+        /**
+         * @return the firstName
+         */
+        public String getFirstName() {
+            return firstName;
+        }
+
+        /**
+         * @param firstName the firstName to set
+         */
+        public void setFirstName(String firstName) {
+            this.firstName = firstName;
+        }
+
+        /**
+         * @return the affiliations
+         */
+        public List<Affiliation> getAffiliations() {
+            return affiliations;
+        }
+
+        /**
+         * @param affiliations the affiliations to set
+         */
+        public void setAffiliations(List<Affiliation> affiliations) {
+            this.affiliations = affiliations;
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#toString()
+         */
+        @Override
+        public String toString() {
+            // FIX: the previous version relied on '+' binding tighter than
+            // '?:', so the null test applied to the entire concatenated
+            // prefix (always non-null) and the method returned only
+            // middleName. Parenthesize the conditional so the whole record
+            // is rendered.
+            return "Author [surName=" + surName
+                    + ", middleName=" + (middleName != null ? middleName : "")
+                    + ", firstName=" + firstName
+                    + ", affiliations=" + affiliations
+                    + "]";
+        }
+
+    }
+
+    /**
+     * Mutable holder for one parsed TEI affiliation: the organization names
+     * and the postal address. Used in List.contains() de-duplication, so
+     * equals() matters.
+     */
+    class Affiliation {
+
+        private OrgName orgName;
+
+        private Address address;
+
+        public Affiliation() {
+            this.orgName = new OrgName();
+            this.address = new Address();
+        }
+
+        /**
+         * @return the orgName
+         */
+        public OrgName getOrgName() {
+            return orgName;
+        }
+
+        /**
+         * @param orgName the orgName to set
+         */
+        public void setOrgName(OrgName orgName) {
+            this.orgName = orgName;
+        }
+
+        /**
+         * @return the address
+         */
+        public Address getAddress() {
+            return address;
+        }
+
+        /**
+         * @param address the address to set
+         */
+        public void setAddress(Address address) {
+            this.address = address;
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#equals(java.lang.Object)
+         */
+        @Override
+        public boolean equals(Object obj) {
+            // FIX: guard before casting -- the previous version cast blindly
+            // and threw ClassCastException (or NPE on null) for non-Affiliation
+            // arguments. Field comparison semantics are unchanged.
+            if (this == obj) {
+                return true;
+            }
+            if (!(obj instanceof Affiliation)) {
+                return false;
+            }
+            Affiliation otherA = (Affiliation) obj;
+            return this.getAddress().equals(otherA.getAddress())
+                    && this.getOrgName().equals(otherA.getOrgName());
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#toString()
+         */
+        @Override
+        public String toString() {
+            return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+        }
+
+    }
+
+    /**
+     * Holder for the org-name entries of one affiliation; one
+     * {@link OrgTypeName} per parsed &lt;orgName&gt; element.
+     */
+    class OrgName {
+        private List<OrgTypeName> typeNames;
+
+        public OrgName() {
+            this.typeNames = new ArrayList<OrgTypeName>();
+        }
+
+        /**
+         * @return the typeNames
+         */
+        public List<OrgTypeName> getTypeNames() {
+            return typeNames;
+        }
+
+        /**
+         * @param typeNames the typeNames to set
+         */
+        public void setTypeNames(List<OrgTypeName> typeNames) {
+            this.typeNames = typeNames;
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#toString()
+         */
+
+        @Override
+        public String toString() {
+            // Space-separated names; note the trailing space after the last
+            // entry (callers such as getMetadataAffiliations strip it).
+            StringBuilder builder = new StringBuilder();
+            for (OrgTypeName on : this.typeNames) {
+                builder.append(on.getName());
+                builder.append(" ");
+            }
+            return builder.toString();
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#equals(java.lang.Object)
+         */
+        @Override
+        public boolean equals(Object obj) {
+            // NOTE(review): equality only compares list SIZES, so two org
+            // names with the same number of different entries compare equal.
+            // Affiliation de-duplication depends on this looseness -- confirm
+            // intent before tightening to element-wise comparison.
+            OrgName otherA = (OrgName) obj;
+
+            if (otherA.getTypeNames() != null) {
+                if (this.typeNames == null) {
+                    return false;
+                } else {
+                    return this.typeNames.size() == otherA.getTypeNames().size();
+                }
+            } else {
+                if (this.typeNames == null) {
+                    return true;
+                } else
+                    return false;
+            }
+
+        }
+
+    }
+
+    /**
+     * One parsed &lt;orgName&gt; entry: the element text (name) and its
+     * "type" attribute. Either field may be null when the element omitted it.
+     */
+    class OrgTypeName {
+        private String name;
+        private String type;
+
+        public OrgTypeName() {
+            this.name = null;
+            this.type = null;
+        }
+
+        /**
+         * @return the name
+         */
+        public String getName() {
+            return name;
+        }
+
+        /**
+         * @param name the name to set
+         */
+        public void setName(String name) {
+            this.name = name;
+        }
+
+        /**
+         * @return the type
+         */
+        public String getType() {
+            return type;
+        }
+
+        /**
+         * @param type the type to set
+         */
+        public void setType(String type) {
+            this.type = type;
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#equals(java.lang.Object)
+         */
+        @Override
+        public boolean equals(Object obj) {
+            // FIX: both fields default to null, so the previous version's
+            // this.type.equals(...) threw NPE for partially-populated
+            // entries; compare null-safely and guard the cast.
+            if (this == obj) {
+                return true;
+            }
+            if (!(obj instanceof OrgTypeName)) {
+                return false;
+            }
+            OrgTypeName otherOrgName = (OrgTypeName) obj;
+            return java.util.Objects.equals(this.type, otherOrgName.getType())
+                    && java.util.Objects.equals(this.name, otherOrgName.getName());
+        }
+
+    }
+
+    /**
+     * One parsed TEI postal address. All fields may remain null when the
+     * corresponding element is absent. Used in List.contains()
+     * de-duplication, so equals() matters.
+     */
+    private class Address {
+
+        private String region;
+        private String postCode;
+        private String settlment;
+        private Country country;
+
+        public Address() {
+            this.region = null;
+            this.postCode = null;
+            this.settlment = null;
+            this.country = new Country();
+        }
+
+        /**
+         * @return the region
+         */
+        public String getRegion() {
+            return region;
+        }
+
+        /**
+         * @param region the region to set
+         */
+        public void setRegion(String region) {
+            this.region = region;
+        }
+
+        /**
+         * @return the postCode
+         */
+        public String getPostCode() {
+            return postCode;
+        }
+
+        /**
+         * @param postCode the postCode to set
+         */
+        public void setPostCode(String postCode) {
+            this.postCode = postCode;
+        }
+
+        /**
+         * @return the settlment
+         */
+        public String getSettlment() {
+            return settlment;
+        }
+
+        /**
+         * @param settlment the settlment to set
+         */
+        public void setSettlment(String settlment) {
+            this.settlment = settlment;
+        }
+
+        /**
+         * @return the country
+         */
+        public Country getCountry() {
+            return country;
+        }
+
+        /**
+         * @param country the country to set
+         */
+        public void setCountry(Country country) {
+            this.country = country;
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#equals(java.lang.Object)
+         */
+        @Override
+        public boolean equals(Object obj) {
+            // FIX: the previous version returned on the FIRST null field and
+            // ignored the remaining fields (e.g. two addresses with null
+            // settlements but different countries compared equal), and cast
+            // blindly. Compare every field null-safely instead.
+            if (this == obj) {
+                return true;
+            }
+            if (!(obj instanceof Address)) {
+                return false;
+            }
+            Address otherA = (Address) obj;
+            return java.util.Objects.equals(this.settlment, otherA.getSettlment())
+                    && java.util.Objects.equals(this.country, otherA.getCountry())
+                    && java.util.Objects.equals(this.postCode, otherA.getPostCode())
+                    && java.util.Objects.equals(this.region, otherA.getRegion());
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#toString()
+         */
+        @Override
+        public String toString() {
+            // Deliberately renders null fields as the literal "null"
+            // (existing output format relied on by TEITest); do not change.
+            StringBuilder builder = new StringBuilder();
+            builder.append(settlment);
+            builder.append(", ");
+            builder.append(region);
+            builder.append(" ");
+            builder.append(postCode);
+            builder.append(" ");
+            builder.append(country.getContent());
+            return builder.toString();
+        }
+    }
+
+    /**
+     * One parsed TEI &lt;country&gt; element: its "key" attribute and its
+     * text content. Either field may remain null.
+     */
+    private class Country {
+        private String key;
+        private String content;
+
+        public Country() {
+            this.key = null;
+            this.content = null;
+        }
+
+        /**
+         * @return the key attribute value, possibly null
+         */
+        public String getKey() {
+            return key;
+        }
+
+        /**
+         * @param key the key attribute value to set
+         */
+        public void setKey(String key) {
+            this.key = key;
+        }
+
+        /**
+         * @return the element text content, possibly null
+         */
+        public String getContent() {
+            return content;
+        }
+
+        /**
+         * @param content the element text content to set
+         */
+        public void setContent(String content) {
+            this.content = content;
+        }
+
+        /*
+         * (non-Javadoc)
+         *
+         * @see java.lang.Object#equals(java.lang.Object)
+         */
+        @Override
+        public boolean equals(Object obj) {
+            // Two countries are equal when both key and content match, with
+            // null matching only null -- the same outcome as a fully
+            // expanded nested null-check, written with Objects.equals.
+            Country otherC = (Country) obj;
+            return java.util.Objects.equals(this.key, otherC.getKey())
+                    && java.util.Objects.equals(this.content, otherC.getContent());
+        }
+    }
+
+    /**
+     * Returns the first child in the list whose node name equals
+     * {@code name}, or null when no child matches.
+     */
+    private static Node getFirstChild(NodeList childNodes, String name) {
+        int length = childNodes.getLength();
+        for (int i = 0; i < length; i++) {
+            Node candidate = childNodes.item(i);
+            if (candidate.getNodeName().equals(name)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the value of the first attribute on {@code node} whose local
+     * name equals {@code name}, or null when the node has no such attribute.
+     *
+     * @param ns   currently unused; kept for signature compatibility
+     * @param name the attribute local name to look for (callers pass
+     *             non-null literals such as "type" and "key")
+     */
+    private static String getFirstAttribute(Node node, String ns, String name) {
+        if (node.hasAttributes()) {
+            NamedNodeMap attrs = node.getAttributes();
+            for (int i = 0; i < attrs.getLength(); i++) {
+                Node attr = attrs.item(i);
+                // FIX: getLocalName() returns null for attributes created
+                // without namespace awareness; compare from the non-null
+                // side to avoid the NPE the previous version could throw.
+                if (name.equals(attr.getLocalName())) {
+                    return attr.getNodeValue();
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Collects every child whose local name equals {@code localName};
+     * returns an empty list when none match.
+     */
+    private static List<Node> getChildNodes(NodeList childNodes, String localName) {
+        List<Node> matches = new ArrayList<>();
+        int length = childNodes.getLength();
+        for (int i = 0; i < length; i++) {
+            Node child = childNodes.item(i);
+            String childLocal = child.getLocalName();
+            if (childLocal != null && childLocal.equals(localName)) {
+                matches.add(child);
+            }
+        }
+        return matches;
+    }
+
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..0e3d2d7
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collection;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Properties;
+import java.util.Iterator;
+import java.util.Locale;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  ne_chunk() module of NLTK. This NER requires additional setup,
+ *  due to Http requests to an endpoint server that runs NLTK.
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNLTK">
+ *
+ */
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  ne_chunk() module of NLTK. This NER requires additional setup,
+ *  due to Http requests to an endpoint server that runs NLTK.
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNLTK">
+ *
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+    // NOTE: static, so one instance's probe result is shared by every
+    // instance in the JVM (existing behavior, preserved).
+    private static boolean available = false;
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
+    private String restHostUrlStr;
+    /**
+     * some common entities identified by NLTK; grows at runtime with the
+     * (upper-cased) types the server reports -- see {@link #recognise(String)}
+     */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add("NAMES");
+    }};
+
+    /**
+     * Resolves the server URL from NLTKServer.properties (falling back to
+     * {@value #NLTK_REST_HOST}) and probes it once; a 200 response marks the
+     * recogniser available.
+     */
+    public NLTKNERecogniser(){
+        try {
+            String configuredUrl = "";
+            try {
+                configuredUrl = readRestUrl();
+            } catch (IOException e) {
+                LOG.warn("Can't read rest url", e);
+            }
+
+            if (configuredUrl == null || configuredUrl.isEmpty()) {
+                this.restHostUrlStr = NLTK_REST_HOST;
+            } else {
+                this.restHostUrlStr = configuredUrl;
+            }
+
+            // FIX: probe the RESOLVED url. The previous version probed the
+            // local variable, which is the empty string when the properties
+            // file is missing/blank, so the default host was never reached.
+            Response response =
+                    WebClient.create(this.restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            int responseCode = response.getStatus();
+            if (responseCode == 200) {
+                available = true;
+            } else {
+                LOG.info("NLTKRest Server is not running");
+            }
+
+        } catch (Exception e) {
+            LOG.warn(e.getMessage(), e);
+        }
+    }
+
+    private static String readRestUrl() throws IOException {
+        // NOTE(review): if NLTKServer.properties is absent this NPEs (caught
+        // by the constructor's broad catch) rather than raising IOException.
+        Properties nltkProperties = new Properties();
+        nltkProperties.load(NLTKNERecogniser.class
+                .getResourceAsStream("NLTKServer.properties"));
+
+        return nltkProperties.getProperty("nltk.server.url");
+    }
+
+    /**
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not avaliable for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -&gt; set of names
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        try {
+            String url = restHostUrlStr + "/nltk";
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+            if (response.getStatus() == 200) {
+                String result = response.readEntity(String.class);
+                JSONObject json = (JSONObject) new JSONParser().parse(result);
+                for (Object keyObj : json.keySet()) {
+                    String key = (String) keyObj;
+                    if (!key.equals("result")) {
+                        String type = key.toUpperCase(Locale.ENGLISH);
+                        // FIX: advertise the same (upper-cased) type that is
+                        // used as the map key; the previous version added the
+                        // raw key to ENTITY_TYPES but keyed the map on the
+                        // upper-cased form.
+                        ENTITY_TYPES.add(type);
+                        entities.put(type, new HashSet<String>((Collection<String>) json.get(key)));
+                    }
+                }
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+
+        return entities;
+    }
+
+
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..cea7492
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.geo.GeoParser
+org.apache.tika.parser.journal.JournalParser
+org.apache.tika.parser.sentiment.SentimentAnalysisParser
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
new file mode 100644
index 0000000..a7718ab
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+grobid.server.url=http://localhost:8080
+grobid.endpoint.text=/processQuantityText
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java
new file mode 100644
index 0000000..2b82af9
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+/**
+ * Regression test for TEIDOMParser: parses a canned TEI document and pins
+ * the exact metadata strings the parser currently produces (including
+ * formatting quirks such as trailing spaces, duplicated affiliation names
+ * and the literal "null" for a missing region).
+ */
+public class TEITest extends TikaTest {
+
+
+    @Test
+    public void testBasic() throws Exception {
+        TEIDOMParser teiParser = new TEIDOMParser();
+        // Read the whole TEI fixture into a UTF-8 string before parsing.
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (InputStream is = getResourceAsStream("/test-documents/testTEI.xml")) {
+            IOUtils.copy(is, bos);
+        }
+        String xml = new String (bos.toByteArray(), StandardCharsets.UTF_8);
+        Metadata metadata = teiParser.parse(xml, new ParseContext());
+        assertEquals("Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                "Montbonnot Saint-Martin, null 38330, 38330, 38330, 38330 " +
+                "France, France, France, France ", metadata.get("Address"));
+        String[] keywords = new String[]{
+                "F22 [Analysis of Algorithms and Problem Complexity]: Nonnumerical Algorithms and Problems\u2014Sequencing",
+                "and scheduling; D41 [Operating Systems]: Process management\u2014Scheduling, Concurrency",
+                "Keywords",
+                "Parallel Computing, Algorithms, Scheduling, Parallel Tasks,",
+                "Moldable Tasks, Bi-criteria"
+        };
+        assertArrayEquals(keywords, metadata.getValues("Keyword"));
+        assertEquals("Pierre-François  Dutot 1 Lionel  Eyraud 1 Grégory  Gr´ 1 Grégory  Mouní 1 Denis  Trystram 1 ",
+                metadata.get("Authors"));
+        assertEquals("Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms *",
+                metadata.get("Title"));
+        assertEquals("1 ID-IMAG ID-IMAG ID-IMAG ID-IMAG", metadata.get("Affiliation"));
+        assertEquals("[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " +
+                        "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                        "null 38330, 38330, 38330, 38330 France, France, France, France}" +
+                        "[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " +
+                        "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                        "null 38330, 38330, 38330, 38330 France, France, France, France}]",
+                metadata.get("FullAffiliations"));
+    }
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
new file mode 100644
index 0000000..6e17415
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Test case for {@link NamedEntityParser}
+ */
public class NamedEntityParserTest extends TikaTest {

    // Test Tika configuration, resolved relative to this class on the test
    // classpath; it registers NamedEntityParser for the mime types under test.
    public static final String CONFIG_FILE = "tika-config.xml";

    @Test
    public void testParse() throws Exception {

        //test config is added to resources directory
        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
        Tika tika = new Tika(config);
        String text = "I am student at University of Southern California (USC)," +
                " located in Los Angeles . USC's football team is called by name Trojans." +
                " Mr. John McKay was a head coach of the team from 1960 - 1975";
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);

        // NOTE: assumeTrue (not assertTrue) is used throughout, so the test is
        // skipped rather than failed when the optional NER models/recognisers
        // are not available at runtime.
        HashSet<String> set = new HashSet<String>();
        set.addAll(Arrays.asList(md.getValues("X-Parsed-By")));
        assumeTrue(set.contains(NamedEntityParser.class.getName()));

        set.clear();
        set.addAll(Arrays.asList(md.getValues("NER_PERSON")));
        assumeTrue(set.contains("John McKay"));

        set.clear();
        set.addAll(Arrays.asList(md.getValues("NER_LOCATION")));
        assumeTrue(set.contains("Los Angeles"));

        set.clear();
        set.addAll(Arrays.asList(md.getValues("NER_ORGANIZATION")));
        assumeTrue(set.contains("University of Southern California"));

        set.clear();
        set.addAll(Arrays.asList(md.getValues("NER_DATE")));
        assumeTrue(set.contains("1960 - 1975"));

    }

    @Test
    public void testNerChain() throws Exception {
        // Chain two recognisers (OpenNLP + regex) via the system property read
        // by NamedEntityParser.
        // NOTE(review): the system property is not restored after the test, so
        // it may leak into other tests in the same JVM — consider resetting it.
        String classNames = OpenNLPNERecogniser.class.getName()
                + "," + RegexNERecogniser.class.getName();
        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
        Tika tika = new Tika(config);
        String text = "University of Southern California (USC), is located in Los Angeles ." +
                " Campus is busy from monday to saturday";
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
        HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
        assumeTrue(keys.contains("NER_WEEK_DAY"));
        assumeTrue(keys.contains("NER_LOCATION"));

    }
}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..4b0101e
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
public class NLTKNERecogniserTest {
    @Test
    public void testGetEntityTypes() throws Exception {
        String text = "America is a big country.";
        // Select the NLTK-backed recogniser implementation for this run.
        // NOTE(review): the system property is not restored afterwards and may
        // leak into other tests in the same JVM.
        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());

        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);

        // NER_NAMES is empty when the external NLTK REST service is not
        // reachable — presumably why the assertions are guarded rather than
        // unconditional (verify against NLTKNERecogniser's availability check).
        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
        if(names.size() != 0) {
            assertTrue(names.contains("America"));
            assertTrue(names.size() == 1); 
        }
    }
}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
new file mode 100755
index 0000000..c17899e
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Download the OpenNLP NER models used by the tests.
# Fail fast on any error so a failed download does not leave a
# truncated/empty .bin file behind looking like a valid model.
set -e

echo "Getting OpenNLP NER models"

# Core models (person, location, organization) plus four additional ones
# (date, money, time, percentage). Output file names drop the "en-" prefix.
for model in person location organization date money time percentage; do
    wget "http://opennlp.sourceforge.net/models-1.5/en-ner-${model}.bin" -O "ner-${model}.bin"
done
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/regex/tika-config.xml b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/regex/tika-config.xml
new file mode 100644
index 0000000..267c399
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/regex/tika-config.xml
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<!-- Test configuration: registers NamedEntityParser for plain-text and
     (X)HTML input. Located under .../parser/ner/regex, so presumably loaded
     by the regex NER recogniser tests (verify against the test classes). -->
<properties>
    <parsers>
        <parser class="org.apache.tika.parser.ner.NamedEntityParser">
            <mime>text/plain</mime>
            <mime>text/html</mime>
            <mime>application/xhtml+xml</mime>
        </parser>
    </parsers>

</properties>


[tika] 03/03: Git add files...no idea how this mv failed...

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3179
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4abe3d1b7447953a371497a429e940de89b09dd9
Author: tallison <ta...@apache.org>
AuthorDate: Tue Sep 1 11:23:10 2020 -0400

    Git add files...no idea how this mv failed...
---
 .../tika/parser/recognition/tf/inceptionapi.py     | 483 +++++++++++++++++++++
 1 file changed, 483 insertions(+)

diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
new file mode 100755
index 0000000..09d830c
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
@@ -0,0 +1,483 @@
+#!/usr/bin/env python
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+"""
+    Image classification with Inception.
+
+    This script exposes the tensorflow's inception classification service over REST API.
+
+    For more details, visit:
+        https://tensorflow.org/tutorials/image_recognition/
+
+    Requirements :
+      Flask
+      tensorflow
+      numpy
+      requests
+      pillow
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+import json
+import logging
+import requests
+
+from flask import Flask, request, Response, jsonify
+from io import BytesIO
+from logging.handlers import RotatingFileHandler
+from PIL import Image
+from time import time
+
+import tensorflow as tf
+
+from inception_v4 import default_image_size, inception_v4_arg_scope, inception_v4
+
try:
    # This import is placed inside here to ensure that video_util and OpenCV is not required for image recognition APIs
    from video_util import get_center_frame, get_frames_interval, get_n_frames
except:
    # NOTE(review): bare ``except`` deliberately makes video support optional,
    # but it also hides real errors inside video_util — consider narrowing to
    # ImportError.
    print("Can't import video libraries, No video functionality is available")

# Render floats with two decimal places in JSON responses.
json.encoder.FLOAT_REPR = lambda o: format(o, '.2f')  # JSON serialization of floats
slim = tf.contrib.slim  # TF-Slim (TensorFlow 1.x API), used for Inception V4
FLAGS = tf.app.flags.FLAGS

# Command-line flags: checkpoint/label directory, HTTP port, log file name.
tf.app.flags.DEFINE_string('model_dir',
                           '/usr/share/apache-tika/models/dl/image-video/recognition/',
                           """Path to inception_v4.ckpt & meta files""")
tf.app.flags.DEFINE_integer('port',
                            '8764',
                            """Server PORT, default:8764""")
tf.app.flags.DEFINE_string('log',
                           'inception.log',
                           """Log file name, default: inception.log""")
+
+
def preprocess_image(image, height, width, central_fraction=0.875, scope=None):
    """Prepare one image for evaluation.
    If height and width are specified it would output an image with that size by
    applying resize_bilinear.
    If central_fraction is specified it would crop the central fraction of the
    input image.
    Args:
      image: 3-D Tensor of image. If dtype is tf.float32 then the range should be
        [0, 1], otherwise it would converted to tf.float32 assuming that the range
        is [0, MAX], where MAX is largest positive representable number for
        int(8/16/32) data type (see `tf.image.convert_image_dtype` for details).
      height: integer
      width: integer
      central_fraction: Optional Float, fraction of the image to crop.
      scope: Optional scope for name_scope.
    Returns:
      3-D float Tensor of prepared image.
    """
    with tf.name_scope(scope, 'eval_image', [image, height, width]):
        if image.dtype != tf.float32:
            # convert_image_dtype also rescales integer inputs into [0, 1]
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        # Crop the central region of the image with an area containing 87.5% of
        # the original image.
        if central_fraction:
            image = tf.image.central_crop(image, central_fraction=central_fraction)

        if height and width:
            # Resize the image to the specified height and width.
            # resize_bilinear expects a 4-D batch, hence the expand/squeeze pair.
            image = tf.expand_dims(image, 0)
            image = tf.image.resize_bilinear(image, [height, width],
                                             align_corners=False)
            image = tf.squeeze(image, [0])
        # Rescale pixel values from [0, 1] to [-1, 1].
        image = tf.subtract(image, 0.5)
        image = tf.multiply(image, 2.0)
        return image
+
+
def create_readable_names_for_imagenet_labels():
    """
        Create a dict mapping label id to human readable string.
        Returns:
            labels_to_names: dictionary where keys are integers from to 1000
            and values are human-readable names.

        We retrieve a synset file, which contains a list of valid synset labels used
        by ILSVRC competition. There is one synset one per line, eg.
                #   n01440764
                #   n01443537
        We also retrieve a synset_to_human_file, which contains a mapping from synsets
        to human-readable names for every synset in Imagenet. These are stored in a
        tsv format, as follows:
                #   n02119247    black fox
                #   n02119359    silver fox
        We assign each synset (in alphabetical order) an integer, starting from 1
        (since 0 is reserved for the background class).

        Code is based on
        https://github.com/tensorflow/models/blob/master/inception/inception/data/build_imagenet_data.py
    """

    dest_directory = FLAGS.model_dir

    # BUGFIX: the original opened both label files without ever closing them
    # (resource leak); use context managers instead.
    with open(os.path.join(dest_directory, 'imagenet_lsvrc_2015_synsets.txt')) as synsets_file:
        synset_list = [s.strip() for s in synsets_file]
    # Sanity check: ILSVRC defines exactly 1000 classes.
    assert len(synset_list) == 1000

    with open(os.path.join(dest_directory, 'imagenet_metadata.txt')) as metadata_file:
        synset_to_human_list = metadata_file.readlines()
    # Sanity check: the full ImageNet metadata file has 21842 synsets.
    assert len(synset_to_human_list) == 21842

    # Parse the tab-separated "synset<TAB>human name" lines.
    synset_to_human = {}
    for line in synset_to_human_list:
        parts = line.strip().split('\t')
        assert len(parts) == 2
        synset_to_human[parts[0]] = parts[1]

    # Label 0 is reserved for the background class; real classes start at 1.
    labels_to_names = {0: 'background'}
    for label_index, synset in enumerate(synset_list, start=1):
        labels_to_names[label_index] = synset_to_human[synset]

    return labels_to_names
+
+
def get_remote_file(url, success=200, timeout=10):
    """
        Given HTTP URL, this api gets the content of it
        returns (Content-Type, image_content) on success, (None, None) otherwise
    """
    try:
        app.logger.info("GET: %s" % url)
        res = requests.get(url, stream=True, timeout=timeout)
        if res.status_code == success:
            return res.headers.get('Content-Type', 'application/octet-stream'), res.raw.data
    except requests.RequestException as e:
        # BUGFIX: narrowed from a bare ``except`` (which also swallowed
        # programming errors and KeyboardInterrupt). Network failures are
        # logged and reported via the (None, None) fallback, as before.
        app.logger.warning("GET failed for %s: %s" % (url, e))
    return None, None
+
+
def current_time():
    """Return the current wall-clock time in whole milliseconds."""
    return int(time() * 1000)
+
+
class Classifier(Flask):
    """Classifier Service class.

    A Flask application that builds the Inception V4 inference graph once at
    construction time and keeps a single TF session open for all requests.
    """

    def __init__(self, name):
        super(Classifier, self).__init__(name)
        # Rotating file log: 100 MB per file, up to 20 backups.
        file_handler = RotatingFileHandler(FLAGS.log, maxBytes=1024 * 1024 * 100, backupCount=20)
        file_handler.setLevel(logging.INFO)
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)
        # label id -> human-readable class name
        self.names = create_readable_names_for_imagenet_labels()
        self.image_size = default_image_size

        # Build the graph once: JPEG string placeholder -> decode -> preprocess
        # -> Inception V4 logits -> softmax probabilities.
        self.image_str_placeholder = tf.placeholder(tf.string)
        image = tf.image.decode_jpeg(self.image_str_placeholder, channels=3)
        processed_image = preprocess_image(image, self.image_size, self.image_size)
        processed_images = tf.expand_dims(processed_image, 0)
        # create the model, use the default arg scope to configure the batch norm parameters.
        with slim.arg_scope(inception_v4_arg_scope()):
            logits, _ = inception_v4(processed_images, num_classes=1001, is_training=False)
        self.probabilities = tf.nn.softmax(logits)

        # Restore the pre-trained weights from the checkpoint directory.
        dest_directory = FLAGS.model_dir
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(dest_directory, 'inception_v4.ckpt'),
            slim.get_model_variables('InceptionV4'))

        self.sess = tf.Session()
        init_fn(self.sess)

    def classify(self, image_string, topn, min_confidence):
        """Classify one JPEG-encoded image.

        Args:
          image_string: raw JPEG bytes (fed to the decode_jpeg placeholder).
          topn: max number of classes to return; None means all classes.
          min_confidence: minimum score to keep a class; None keeps everything.
        Returns:
          list of (class_id, class_name, score) tuples, highest score first.
        """
        eval_probabilities = self.sess.run(self.probabilities, feed_dict={self.image_str_placeholder: image_string})
        eval_probabilities = eval_probabilities[0, 0:]
        # indices of classes ordered by descending probability
        sorted_inds = [i[0] for i in sorted(enumerate(-eval_probabilities), key=lambda x: x[1])]

        if topn is None:
            topn = len(sorted_inds)

        res = []
        for i in range(topn):
            index = sorted_inds[i]
            score = float(eval_probabilities[index])
            if min_confidence is None:
                res.append((index, self.names[index], score))
            else:
                if score >= min_confidence:
                    res.append((index, self.names[index], score))
                else:
                    # the scores are in sorted order, so we can break the loop whenever we get a low score object
                    break
        return res
+
+
# Module-level service instance; the @app.route decorators below attach to it.
app = Classifier(__name__)
+
+
@app.route("/")
def index():
    """Index page: returns a static HTML description of the other API end
    points (ping, image classification, video classification)."""

    return """
    <div>
    <h1> Inception REST API </h1>
    <h3> The following API end points are valid </h3>
        <ul>
            <h4> Inception V4 </h4>
            <li> <code>/inception/v4/ping </code> - <br/>
                <b> Description : </b> checks availability of the service. returns "pong" with status 200 when it is available
            </li>
            <li> <code>/inception/v4/classify/image</code> - <br/>
                <table>
                <tr><th align="left"> Description </th><td> This is a classifier service that can classify images</td></tr>
                <tr><td></td> <td>Query Params : <br/>
                   <code>topn </code>: type = int : top classes to get; default : 5 <br/>
                   <code>min_confidence </code>: type = float : minimum confidence that a label should have to exist in topn; default : 0.015 <br/>
                   <code>human </code>: type = boolean : human readable class names; default : true <br/>
                 </td></tr>
                <tr><th align="left"> How to supply Image Content </th></tr>
                <tr><th align="left"> With HTTP GET : </th> <td>
                    Include a query parameter <code>url </code> which is an http url of JPEG image <br/>
                    Example: <code> curl "localhost:8764/inception/v4/classify/image?url=http://xyz.com/example.jpg"</code>
                </td></tr>
                <tr><th align="left"> With HTTP POST :</th><td>
                    POST JPEG image content as binary data in request body. <br/>
                    Example: <code> curl -X POST "localhost:8764/inception/v4/classify/image?topn=5&min_confidence=0.015&human=false" --data-binary @example.jpg </code>
                </td></tr>
                </table>
            </li>
            <li> <code>/inception/v4/classify/video</code> - <br/>
                <table>
                <tr><th align="left"> Description </th><td> This is a classifier service that can classify videos</td></tr>
                <tr><td></td> <td>Query Params : <br/>
                   <code>topn </code>: type = int : top classes to get; default : 5 <br/>
                   <code>min_confidence </code>: type = float : minimum confidence that a label should have to exist in topn; default : 0.015 <br/>
                   <code>human </code>: type = boolean : human readable class names; default : true <br/>
                   <code>mode </code>: options = <code>{"center", "interval", "fixed"}</code> : Modes of frame extraction; default : center <br/>
                    &emsp; <code>"center"</code> - Just one frame in center. <br/>
                    &emsp; <code>"interval"</code> - Extracts frames after fixed interval. <br/>
                    &emsp; <code>"fixed"</code> - Extract fixed number of frames.<br/>
                   <code>frame-interval </code>: type = int : Interval for frame extraction to be used with INTERVAL mode. If frame_interval=10 then every 10th frame will be extracted; default : 10 <br/>
                   <code>num-frame </code>: type = int : Number of frames to be extracted from video while using FIXED model. If num_frame=10 then 10 frames equally distant from each other will be extracted; default : 10 <br/>

                 </td></tr>
                <tr><th align="left"> How to supply Video Content </th></tr>
                <tr><th align="left"> With HTTP GET : </th> <td>
                    Include a query parameter <code>url </code> which is path on file system <br/>
                    Example: <code> curl "localhost:8764/inception/v4/classify/video?url=filesystem/path/to/video"</code><br/>
                </td></tr><br/>
                <tr><th align="left"> With HTTP POST :</th><td>
                    POST video content as binary data in request body. If video can be decoded by OpenCV it should be fine. It's tested on mp4 and avi on mac <br/>
                    Include a query parameter <code>ext </code>this extension is needed to tell OpenCV which decoder to use, default is ".mp4" </br>
                    Example: <code> curl -X POST "localhost:8764/inception/v4/classify/video?topn=5&min_confidence=0.015&human=false" --data-binary @example.mp4 </code>
                </td></tr>
                </table>
            </li>
        <ul>
    </div>
    """
+
+
+@app.route("/inception/v4/ping", methods=["GET"])
+def ping_pong():
+    """API to do health check. If this says status code 200, then healthy"""
+
+    return "pong"
+
+
@app.route("/inception/v4/classify/image", methods=["GET", "POST"])
def classify_image():
    """API to classify images.

    Query params:
        topn (int): number of top classes to return, default 5.
        min_confidence (float): minimum score for a class to be kept, default 0.015.
        human (bool): include human-readable class names, default true.
    Image bytes come from the POST body, or are fetched from the ``url``
    query parameter on GET.
    """

    image_format = "not jpeg"

    st = current_time()
    topn = int(request.args.get("topn", "5"))
    min_confidence = float(request.args.get("min_confidence", "0.015"))
    human = request.args.get("human", "true").lower() in ("true", "1", "yes")
    if request.method == 'POST':
        image_data = request.get_data()
    else:
        url = request.args.get("url")
        c_type, image_data = get_remote_file(url)
        if not image_data:
            # BUGFIX: the original passed jsonify(...) — a Response object —
            # as the response *body*; serialize the error payload instead.
            return Response(status=400,
                            response=json.dumps({"error": "Could not HTTP GET %s" % url}),
                            mimetype="application/json")
        if 'image/jpeg' in c_type:
            image_format = "jpeg"

    # JPEG bytes can be fed to the graph's decode_jpeg placeholder directly;
    # anything else is re-encoded to JPEG via Pillow first.
    if image_format == "jpeg":
        jpg_image = image_data
    else:
        # open the image from raw bytes
        image = Image.open(BytesIO(image_data))
        # convert the image to RGB format, otherwise will give errors when converting to jpeg, if the image isn't RGB
        rgb_image = image.convert("RGB")
        # convert the RGB image to jpeg
        image_bytes = BytesIO()
        rgb_image.save(image_bytes, format="jpeg", quality=95)
        jpg_image = image_bytes.getvalue()
        image_bytes.close()

    read_time = current_time() - st
    st = current_time()  # reset start time
    try:
        classes = app.classify(image_string=jpg_image, topn=topn, min_confidence=min_confidence)
    except Exception as e:
        app.logger.error(e)
        return Response(status=400, response=str(e))

    # Guard the 3-way unpack: zip(*[]) yields nothing and the original
    # crashed with ValueError when no class met min_confidence.
    # (Also removed a leftover debug print of classnames/confidence.)
    if classes:
        classids, classnames, confidence = zip(*classes)
    else:
        classids, classnames, confidence = (), (), ()

    classifier_time = current_time() - st
    app.logger.info("Classifier time : %d" % classifier_time)
    res = {
        'classids': classids,
        'confidence': confidence,
        'time': {
            'read': read_time,
            'classification': classifier_time,
            'units': 'ms'
        }
    }
    if human:
        res['classnames'] = classnames
    return Response(response=json.dumps(res), status=200, mimetype="application/json")
+
+
@app.route("/inception/v4/classify/video", methods=["GET", "POST"])
def classify_video():
    """
        API to classify videos
        Request args -
         url - PATH of file
         topn - number of top scoring labels
         min_confidence - minimum confidence that a label should have to exist in topn
         human - human readable or not
         mode - Modes of frame extraction {"center", "interval", "fixed"}
            "center" - Just one frame in center. <Default option>
            "interval" - Extracts frames after fixed interval.
            "fixed" - Extract fixed number of frames.
         frame-interval - Interval for frame extraction to be used with INTERVAL mode. If frame_interval=10 then every 10th frame will be extracted.
         num-frame - Number of frames to be extracted from video while using FIXED model. If num_frame=10 then 10 frames equally distant from each other will be extracted

         ext - If video is sent in binary format, then ext is needed to tell OpenCV which decoder to use. eg ".mp4"
    """

    st = current_time()
    topn = int(request.args.get("topn", "5"))
    min_confidence = float(request.args.get("min_confidence", "0.015"))
    human = request.args.get("human", "true").lower() in ("true", "1", "yes")

    allowed_modes = ("center", "interval", "fixed")
    mode = request.args.get("mode", "center").lower()
    if mode not in allowed_modes:
        # BUGFIX: the original referenced an undefined name ALLOWED_MODE here
        # (NameError on this path) and passed jsonify(...) as a response body.
        return Response(status=400,
                        response=json.dumps({"error": "not a valid mode. Available modes %s" % str(allowed_modes)}),
                        mimetype="application/json")

    frame_interval = int(request.args.get("frame-interval", "10"))
    num_frame = int(request.args.get("num-frame", "10"))

    temp_path = None
    if request.method == 'POST':
        video_data = request.get_data()
        ext = request.args.get("ext", ".mp4").lower()

        # BUGFIX: use delete=False so the closed file stays readable by name
        # (with delete=True the file can be locked or removed before OpenCV
        # opens it by path), and remove it explicitly when done.
        temp_file = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
        try:
            temp_file.write(video_data)
        finally:
            temp_file.close()

        temp_path = temp_file.name
        url = temp_path
    else:
        url = request.args.get("url")

    read_time = current_time() - st
    st = current_time()  # reset start time

    try:
        # Extract frames according to the requested mode.
        if mode == "center":
            image_data_arr = [get_center_frame(url)]
        elif mode == "interval":
            image_data_arr = get_frames_interval(url, frame_interval)
        else:
            image_data_arr = get_n_frames(url, num_frame)

        # Sum per-class scores across frames; averaged below.
        classes = []
        for image_data in image_data_arr:
            try:
                _classes = app.classify(image_data, topn=None, min_confidence=None)
            except Exception as e:
                app.logger.error(e)
                return Response(status=400, response=str(e))

            # sort by class id so the per-frame lists line up index-by-index
            _classes.sort()
            if not classes:
                classes = _classes
            else:
                for idx, _c in enumerate(_classes):
                    c = list(classes[idx])
                    c[2] += _c[2]
                    classes[idx] = tuple(c)
    finally:
        # Clean up the uploaded temp file, if any.
        if temp_path:
            try:
                os.remove(temp_path)
            except OSError:
                pass

    top_classes = []
    for c in classes:
        c = list(c)
        # average the summed confidence over the number of frames
        c[2] = c[2] / len(image_data_arr)
        if c[2] >= min_confidence:
            top_classes.append(tuple(c))

    top_classes = sorted(top_classes, key=lambda tup: tup[2])[-topn:][::-1]

    # Guard the 3-way unpack: the original crashed when no class survived
    # the min_confidence filter.
    if top_classes:
        classids, classnames, confidence = zip(*top_classes)
    else:
        classids, classnames, confidence = (), (), ()

    classifier_time = current_time() - st
    app.logger.info("Classifier time : %d" % classifier_time)
    res = {
        'classids': classids,
        'confidence': confidence,
        'time': {
            'read': read_time,
            'classification': classifier_time,
            'units': 'ms'
        }
    }
    if human:
        res['classnames'] = classnames
    return Response(response=json.dumps(res), status=200, mimetype="application/json")
+
+
def main(_):
    """Entry point invoked by tf.app.run(); starts the Flask server.

    The unused argument receives leftover argv from tf.app.run().
    """
    if not app.debug:
        print("Serving on port %d" % FLAGS.port)
    # Bind on all interfaces so the service is reachable from other hosts.
    app.run(host="0.0.0.0", port=FLAGS.port)


if __name__ == '__main__':
    # tf.app.run() parses the flags defined above, then calls main().
    tf.app.run()