You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/01 15:23:47 UTC
[tika] 02/03: Git add files...no idea how this mv failed...
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3179
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 057186813a46b2d2a90a16945c1f5604d350caf5
Author: tallison <ta...@apache.org>
AuthorDate: Tue Sep 1 11:22:42 2020 -0400
Git add files...no idea how this mv failed...
---
.../tika/dl/imagerec/dl4j-inception3-config.xml | 35 +
.../apache/tika/dl/imagerec/dl4j-vgg16-config.xml | 32 +
.../parser/recognition/AgeRecogniserConfig.java | 69 ++
.../apache/tika/parser/captioning/tf/im2txtapi.py | 266 +++++++
.../parser/recognition/tika-config-tflow-rest.xml | 33 +
.../recognition/ObjectRecognitionParserTest.java | 199 +++++
.../tf/TensorflowVideoRecParserTest.java | 55 ++
.../parser/ctakes/CTAKESAnnotationProperty.java | 46 ++
.../tika/parser/geo/NameEntityExtractor.java | 122 +++
.../tika/parser/journal/GrobidRESTParser.java | 116 +++
.../apache/tika/parser/journal/TEIDOMParser.java | 882 +++++++++++++++++++++
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 147 ++++
.../services/org.apache.tika.parser.Parser | 18 +
.../tika/parser/ner/grobid/GrobidServer.properties | 17 +
.../org/apache/tika/parser/journal/TEITest.java | 69 ++
.../tika/parser/ner/NamedEntityParserTest.java | 91 +++
.../tika/parser/ner/nltk/NLTKNERecogniserTest.java | 49 ++
.../apache/tika/parser/ner/opennlp/get-models.sh | 26 +
.../apache/tika/parser/ner/regex/tika-config.xml | 27 +
19 files changed, 2299 insertions(+)
diff --git a/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml
new file mode 100644
index 0000000..2728063
--- /dev/null
+++ b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+ <mime>image/jpeg</mime>
+ <params>
+ <param name="modelWeightsPath" type="string">https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/inception_v3_keras_2.h5</param>
+ <param name="labelFile" type="string">https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/imagenet_class_index.json</param>
+ <param name="topN" type="int">10</param>
+ <param name="minConfidence" type="double">0.015</param>
+ <param name="class" type="string">org.apache.tika.dl.imagerec.DL4JInceptionV3Net</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
new file mode 100644
index 0000000..940a4b6
--- /dev/null
+++ b/tika-parsers-advanced/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+ <mime>image/jpeg</mime>
+ <params>
+ <param name="topN" type="int">3</param>
+ <param name="minConfidence" type="double">0.015</param>
+ <param name="class" type="string">org.apache.tika.dl.imagerec.DL4JVGG16Net</param>
+ <param name="modelType" type="string">VGG16</param>
+ <param name="serialize" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java b/tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
new file mode 100644
index 0000000..92427f4
--- /dev/null
+++ b/tika-parsers-advanced/tika-nlp/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition;
+
+import java.net.URL;
+import java.util.Map;
+
+import org.apache.tika.config.Param;
+
+
+/**
+ * Holds the paths of the AgePredictor classify and regression models.
+ * <p>
+ * Each path is obtained by looking up the resource named by the corresponding
+ * parameter with {@link Class#getResource(String)} and taking {@link URL#getFile()}
+ * of the result. A path stays {@code null} when its resource cannot be found.
+ */
+public class AgeRecogniserConfig {
+
+    private String pathClassifyModel = null;
+    private String pathClassifyRegression = null;
+
+    /**
+     * @param params parser parameters; must contain "age.path.classify" and
+     *               "age.path.regression"
+     * @throws NullPointerException if either parameter (or its value) is missing;
+     *                              the message names the missing key
+     */
+    public AgeRecogniserConfig(Map<String, Param> params) {
+        // assign the fields directly: calling overridable setters from a constructor
+        // would run subclass code before the object is fully constructed
+        this.pathClassifyModel = resolveResourcePath(params, "age.path.classify");
+        this.pathClassifyRegression = resolveResourcePath(params, "age.path.regression");
+    }
+
+    /**
+     * Resolves the resource named by parameter {@code key}.
+     *
+     * @return the {@link URL#getFile()} part of the resource URL (taken verbatim,
+     *         i.e. not URL-decoded), or {@code null} if the resource does not exist
+     */
+    private static String resolveResourcePath(Map<String, Param> params, String key) {
+        Param param = params.get(key);
+        if (param == null || param.getValue() == null) {
+            throw new NullPointerException("Missing required parameter '" + key + "'");
+        }
+        URL url = AgeRecogniserConfig.class.getResource(param.getValue().toString());
+        return url == null ? null : url.getFile();
+    }
+
+    public String getPathClassifyModel() {
+        return pathClassifyModel;
+    }
+
+    public void setPathClassifyModel(String pathClassifyModel) {
+        this.pathClassifyModel = pathClassifyModel;
+    }
+
+    public String getPathClassifyRegression() {
+        return pathClassifyRegression;
+    }
+
+    public void setPathClassifyRegression(String pathClassifyRegression) {
+        this.pathClassifyRegression = pathClassifyRegression;
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py
new file mode 100644
index 0000000..97f1f2a
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/captioning/tf/im2txtapi.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+ This script exposes image captioning service over a REST API. Image captioning implementation based on the paper,
+
+ "Show and Tell: A Neural Image Caption Generator"
+ Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan
+
+ For more details, please visit :
+ http://arxiv.org/abs/1411.4555
+ Requirements :
+ Flask
+ tensorflow
+ numpy
+ requests
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import logging
+import math
+import requests
+import sys
+
+from flask import Flask, request, Response, jsonify
+from io import BytesIO
+from PIL import Image
+from time import time
+
+import tensorflow as tf
+import xml.etree.ElementTree as ET
+
+import model_wrapper
+import vocabulary
+import caption_generator
+
+# turning off the traceback by limiting its depth
+sys.tracebacklimit = 0
+
+# informative log messages for advanced users to troubleshoot errors when modifying model_info.xml
+try:
+ info = ET.parse('/usr/share/apache-tika/models/dl/image/caption/model_info.xml').getroot()
+except IOError:
+ logging.exception('model_info.xml is not found')
+ sys.exit(1)
+
+model_main = info.find('model_main')
+if model_main is None:
+ logging.exception('<checkpoint_path> tag under <model_main> tag in model_info.xml is not found')
+ sys.exit(1)
+
+checkpoint_path = model_main.find('checkpoint_path')
+if checkpoint_path is None:
+ logging.exception('<checkpoint_path> tag under <model_main> tag in model_info.xml is not found')
+ sys.exit(1)
+else:
+ checkpoint_path = checkpoint_path.text
+
+vocab_file = model_main.find('vocab_file')
+if vocab_file is None:
+ logging.exception('<vocab_file> tag under <model_main> tag in model_info.xml is not found')
+ sys.exit(1)
+else:
+ vocab_file = vocab_file.text
+
+port = info.get('port')
+if port is None:
+ logging.exception('port attribute in <service> tag in model_info.xml is not found')
+ sys.exit(1)
+
+# turning on the traceback by setting it to default
+sys.tracebacklimit = 1000
+
+FLAGS = tf.flags.FLAGS
+tf.flags.DEFINE_string("checkpoint_path", checkpoint_path, """Directory containing the model checkpoint file.""")
+tf.flags.DEFINE_string('vocab_file', vocab_file, """Text file containing the vocabulary.""")
+tf.flags.DEFINE_integer('port', port, """Server PORT, default:8764""")
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+class Initializer(Flask):
+ """
+ Class to initialize the REST API, this class loads the model from the given checkpoint path in model_info.xml
+ and prepares a caption_generator object
+ """
+
+ def __init__(self, name):
+ super(Initializer, self).__init__(name)
+ # build the inference graph
+ g = tf.Graph()
+ with g.as_default():
+ model = model_wrapper.ModelWrapper()
+ restore_fn = model.build_graph(FLAGS.checkpoint_path)
+ g.finalize()
+ # make the model globally available
+ self.model = model
+ # create the vocabulary
+ self.vocab = vocabulary.Vocabulary(FLAGS.vocab_file)
+ self.sess = tf.Session(graph=g)
+ # load the model from checkpoint
+ restore_fn(self.sess)
+
+
+def current_time():
+ """Returns current time in milli seconds"""
+
+ return int(1000 * time())
+
+
+app = Initializer(__name__)
+
+
+def get_remote_file(url, success=200, timeout=10):
+ """
+ Given HTTP URL, this api gets the content of it
+ returns (Content-Type, image_content)
+ """
+ try:
+ app.logger.info("GET: %s" % url)
+ auth = None
+ res = requests.get(url, stream=True, timeout=timeout, auth=auth)
+ if res.status_code == success:
+ return res.headers.get('Content-Type', 'application/octet-stream'), res.raw.data
+ except:
+ pass
+ return None, None
+
+
+@app.route("/")
+def index():
+ """The index page which provide information about other API end points"""
+
+ return """
+ <div>
+ <h1> Image Captioning REST API </h1>
+ <h3> The following API end points are valid </h3>
+ <ul>
+ <h4> Inception V3 </h4>
+ <li> <code>/inception/v3/ping </code> - <br/>
+ <b> Description : </b> checks availability of the service. returns "pong" with status 200 when it is available
+ </li>
+ <li> <code>/inception/v3/caption/image</code> - <br/>
+ <table>
+ <tr><th align="left"> Description </th><td> This is a service that can caption images</td></tr>
+ <tr><th align="left"> How to supply Image Content </th></tr>
+ <tr><th align="left"> With HTTP GET : </th> <td>
+ Include a query parameter <code>url </code> which is an http url of JPEG image <br/>
+ Example: <code> curl "localhost:8764/inception/v3/caption/image?url=http://xyz.com/example.jpg"</code>
+ </td></tr>
+ <tr><th align="left"> With HTTP POST :</th><td>
+ POST JPEG image content as binary data in request body. <br/>
+ Example: <code> curl -X POST "localhost:8764/inception/v3/caption/image" --data-binary @example.jpg </code>
+ </td></tr>
+ </table>
+ </li>
+ <ul>
+ </div>
+ """
+
+
+@app.route("/inception/v3/ping", methods=["GET"])
+def ping_pong():
+ """API to do health check. If this says status code 200, then healthy"""
+
+ return "pong"
+
+
+@app.route("/inception/v3/caption/image", methods=["GET", "POST"])
+def caption_image():
+ """API to caption images"""
+ image_format = "not jpeg"
+
+ st = current_time()
+ # get beam_size
+ beam_size = int(request.args.get("beam_size", "3"))
+ # get max_caption_length
+ max_caption_length = int(request.args.get("max_caption_length", "20"))
+ # get image_data
+ if request.method == 'POST':
+ image_data = request.get_data()
+ else:
+ url = request.args.get("url")
+ c_type, image_data = get_remote_file(url)
+ if not image_data:
+ return Response(status=400, response=jsonify(error="Could not HTTP GET %s" % url))
+ if 'image/jpeg' in c_type:
+ image_format = "jpeg"
+
+ # use c_type to find whether image_format is jpeg or not
+ # if jpeg, don't convert
+ if image_format == "jpeg":
+ jpg_image = image_data
+ # if not jpeg
+ else:
+ # open the image from raw bytes
+ image = Image.open(BytesIO(image_data))
+ # convert the image to RGB format, otherwise will give errors when converting to jpeg, if the image isn't RGB
+ rgb_image = image.convert("RGB")
+ # convert the RGB image to jpeg
+ image_bytes = BytesIO()
+ rgb_image.save(image_bytes, format="jpeg", quality=95)
+ jpg_image = image_bytes.getvalue()
+ image_bytes.close()
+
+ read_time = current_time() - st
+ # restart counter
+ st = current_time()
+
+ generator = caption_generator.CaptionGenerator(app.model,
+ app.vocab,
+ beam_size=beam_size,
+ max_caption_length=max_caption_length)
+ captions = generator.beam_search(app.sess, jpg_image)
+
+ captioning_time = current_time() - st
+ app.logger.info("Captioning time : %d" % captioning_time)
+
+ array_captions = []
+ for caption in captions:
+ sentence = [app.vocab.id_to_word(w) for w in caption.sentence[1:-1]]
+ sentence = " ".join(sentence)
+ array_captions.append({
+ 'sentence': sentence,
+ 'confidence': math.exp(caption.logprob)
+ })
+
+ response = {
+ 'beam_size': beam_size,
+ 'max_caption_length': max_caption_length,
+ 'captions': array_captions,
+ 'time': {
+ 'read': read_time,
+ 'captioning': captioning_time,
+ 'units': 'ms'
+ }
+ }
+ return Response(response=json.dumps(response), status=200, mimetype="application/json")
+
+
+def main(_):
+ if not app.debug:
+ print("Serving on port %d" % FLAGS.port)
+ app.run(host="0.0.0.0", port=FLAGS.port)
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
new file mode 100644
index 0000000..69a65d0
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/main/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+ <mime>image/jpeg</mime>
+ <mime>image/png</mime>
+ <mime>image/gif</mime>
+ <params>
+ <param name="apiBaseUri" type="uri">http://localhost:8764/inception/v4</param>
+ <param name="topN" type="int">2</param>
+ <param name="minConfidence" type="double">0.015</param>
+ <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
new file mode 100644
index 0000000..25520af
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.List;
+
+/**
+ * Testcases for Object Recognition Parser
+ */
+public class ObjectRecognitionParserTest {
+
+ // Config files
+ private static final String CONFIG_FILE_OBJ_REC = "org/apache/tika/parser/recognition/tika-config-tflow.xml";
+ private static final String CONFIG_REST_FILE_OBJ_REC = "org/apache/tika/parser/recognition/tika-config-tflow-rest.xml";
+ private static final String CONFIG_REST_FILE_IM2TXT = "org/apache/tika/parser/recognition/tika-config-tflow-im2txt-rest.xml";
+
+ // Test images
+ private static final String CAT_IMAGE_JPEG = "test-documents/testJPEG.jpg";
+ private static final String CAT_IMAGE_PNG = "test-documents/testPNG.png";
+ private static final String CAT_IMAGE_GIF = "test-documents/testGIF.gif";
+
+ private static final String BASEBALL_IMAGE_JPEG = "test-documents/baseball.jpg";
+ private static final String BASEBALL_IMAGE_PNG = "test-documents/baseball.png";
+ private static final String BASEBALL_IMAGE_GIF = "test-documents/baseball.gif";
+
+ private static final ClassLoader loader = ObjectRecognitionParserTest.class.getClassLoader();
+
+ private static final Logger LOG = LoggerFactory.getLogger(ObjectRecognitionParserTest.class);
+
+ @Test
+ public void jpegTFObjRecTest() throws IOException, TikaException, SAXException {
+ TensorflowImageRecParser p = new TensorflowImageRecParser();
+ Assume.assumeTrue(p.isAvailable());
+ try (InputStream stream = loader.getResourceAsStream(CONFIG_FILE_OBJ_REC)) {
+ assert stream != null;
+ Tika tika = new Tika(new TikaConfig(stream));
+ Metadata metadata = new Metadata();
+ try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE_JPEG)) {
+ Reader reader = tika.parse(imageStream, metadata);
+ List<String> lines = IOUtils.readLines(reader);
+ String text = StringUtils.join(lines, " ");
+ String[] expectedObjects = {"Egyptian cat", "tabby, tabby cat"};
+ String metaValues = StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY_OBJ_REC), " ");
+ for (String expectedObject : expectedObjects) {
+ String message = "'" + expectedObject + "' must have been detected";
+ Assert.assertTrue(message, text.contains(expectedObject));
+ Assert.assertTrue(message, metaValues.contains(expectedObject));
+ }
+ }
+ }
+ }
+
+ @Test
+ public void jpegRESTObjRecTest() throws Exception {
+ String apiUrl = "http://localhost:8764/inception/v4/ping";
+ boolean available = false;
+ int status = 500;
+ try{
+ status = WebClient.create(apiUrl).get().getStatus();
+ available = status == 200;
+ }
+ catch(Exception ignore){}
+ Assume.assumeTrue(available);
+ String[] expectedObjects = {"Egyptian cat", "tabby, tabby cat"};
+ doRecognize(CONFIG_REST_FILE_OBJ_REC, CAT_IMAGE_JPEG,
+ ObjectRecognitionParser.MD_KEY_OBJ_REC, expectedObjects);
+ }
+
+ @Test
+ public void pngRESTObjRecTest() throws Exception {
+ String apiUrl = "http://localhost:8764/inception/v4/ping";
+ boolean available = false;
+ int status = 500;
+ try{
+ status = WebClient.create(apiUrl).get().getStatus();
+ available = status == 200;
+ }
+ catch(Exception ignore){}
+ Assume.assumeTrue(available);
+ String[] expectedObjects = {"Egyptian cat", "tabby, tabby cat"};
+ doRecognize(CONFIG_REST_FILE_OBJ_REC, CAT_IMAGE_PNG,
+ ObjectRecognitionParser.MD_KEY_OBJ_REC, expectedObjects);
+ }
+
+ @Test
+ public void gifRESTObjRecTest() throws Exception {
+ String apiUrl = "http://localhost:8764/inception/v4/ping";
+ boolean available = false;
+ int status = 500;
+ try{
+ status = WebClient.create(apiUrl).get().getStatus();
+ available = status == 200;
+ }
+ catch(Exception ignore){}
+ Assume.assumeTrue(available);
+ String[] expectedObjects = {"Egyptian cat"};
+ doRecognize(CONFIG_REST_FILE_OBJ_REC, CAT_IMAGE_GIF,
+ ObjectRecognitionParser.MD_KEY_OBJ_REC, expectedObjects);
+ }
+
+ @Test
+ public void jpegRESTim2txtTest() throws Exception {
+ String apiUrl = "http://localhost:8764/inception/v3/ping";
+ boolean available = false;
+ int status = 500;
+ try{
+ status = WebClient.create(apiUrl).get().getStatus();
+ available = status == 200;
+ }
+ catch(Exception ignore){}
+ Assume.assumeTrue(available);
+ String[] expectedCaption = {"a baseball player holding a bat on a field"};
+ doRecognize(CONFIG_REST_FILE_IM2TXT, BASEBALL_IMAGE_JPEG,
+ ObjectRecognitionParser.MD_KEY_IMG_CAP, expectedCaption);
+ }
+
+ @Test
+ public void pngRESTim2txtTest() throws Exception {
+ String apiUrl = "http://localhost:8764/inception/v3/ping";
+ boolean available = false;
+ int status = 500;
+ try{
+ status = WebClient.create(apiUrl).get().getStatus();
+ available = status == 200;
+ }
+ catch(Exception ignore){}
+ Assume.assumeTrue(available);
+ String[] expectedCaption = {"a baseball player holding a bat on a field"};
+ doRecognize(CONFIG_REST_FILE_IM2TXT, BASEBALL_IMAGE_PNG,
+ ObjectRecognitionParser.MD_KEY_IMG_CAP, expectedCaption);
+ }
+
+ @Test
+ public void gifRESTim2txtTest() throws Exception {
+ String apiUrl = "http://localhost:8764/inception/v3/ping";
+ boolean available = false;
+ int status = 500;
+ try{
+ status = WebClient.create(apiUrl).get().getStatus();
+ available = status == 200;
+ }
+ catch(Exception ignore){}
+ Assume.assumeTrue(available);
+ String[] expectedCaption = {"a baseball player pitching a ball on top of a field"};
+ doRecognize(CONFIG_REST_FILE_IM2TXT, BASEBALL_IMAGE_GIF,
+ ObjectRecognitionParser.MD_KEY_IMG_CAP, expectedCaption);
+ }
+
+ private void doRecognize(String configFile, String testImg, String mdKey, String[] expectedObjects) throws Exception {
+ try (InputStream stream = loader.getResourceAsStream(configFile)) {
+ assert stream != null;
+ Tika tika = new Tika(new TikaConfig(stream));
+ Metadata metadata = new Metadata();
+ try (InputStream imageStream = loader.getResourceAsStream(testImg)) {
+ Reader reader = tika.parse(imageStream, metadata);
+ String text = IOUtils.toString(reader);
+ String metaValues = StringUtils.join(metadata.getValues(mdKey), " ");
+ LOG.info("MetaValues = {}", metaValues);
+ for (String expectedObject : expectedObjects) {
+ String message = "'" + expectedObject + "' must have been detected";
+ Assert.assertTrue(message, text.contains(expectedObject));
+ Assert.assertTrue(message, metaValues.contains(expectedObject));
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowVideoRecParserTest.java b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowVideoRecParserTest.java
new file mode 100644
index 0000000..ded686a
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-advancedmedia-module/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowVideoRecParserTest.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition.tf;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.RecognisedObject;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+@Ignore
+public class TensorflowVideoRecParserTest {
+
+    private static final String TEST_VIDEO = "test-documents/testVideoMp4.mp4";
+
+    /**
+     * Runs the REST video recogniser on the test video and checks that it
+     * reports at least one object and at least one distinct label.
+     */
+    @Test
+    public void recognise() throws Exception {
+        TensorflowRESTVideoRecogniser recogniser = new TensorflowRESTVideoRecogniser();
+        recogniser.initialize(new HashMap<String, Param>());
+        try (InputStream stream = getClass().getClassLoader().getResourceAsStream(TEST_VIDEO)) {
+            // fail with a clear message instead of handing a null stream to the recogniser
+            Assert.assertNotNull("Test video not found on classpath: " + TEST_VIDEO, stream);
+            List<RecognisedObject> objects = recogniser.recognise(stream, new DefaultHandler(), new Metadata(), new ParseContext());
+
+            Assert.assertFalse("no objects were recognised", objects.isEmpty());
+            Set<String> objectLabels = new HashSet<>();
+            for (RecognisedObject object : objects) {
+                objectLabels.add(object.getLabel());
+            }
+            Assert.assertFalse("no labels were recognised", objectLabels.isEmpty());
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
new file mode 100644
index 0000000..1c1be02
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+
+/**
+ * This enumeration includes the properties that an {@link IdentifiedAnnotation} object can provide.
+ * Each constant carries the property name string returned by {@link #getName()}.
+ */
+public enum CTAKESAnnotationProperty {
+    BEGIN("start"),
+    END("end"),
+    CONDITIONAL("conditional"),
+    CONFIDENCE("confidence"),
+    // NOTE: the constant name is misspelled ("TECNIQUE"); it is kept as-is because
+    // renaming a public enum constant would break existing callers
+    DISCOVERY_TECNIQUE("discoveryTechnique"),
+    GENERIC("generic"),
+    HISTORY_OF("historyOf"),
+    ID("id"),
+    ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+    POLARITY("polarity");
+
+    // assigned once in the constructor and never changed afterwards
+    private final String name;
+
+    CTAKESAnnotationProperty(String name) {
+        this.name = name;
+    }
+
+    /**
+     * @return the property name string of this constant, e.g. {@code "start"} for {@link #BEGIN}
+     */
+    public String getName() {
+        return name;
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java
new file mode 100644
index 0000000..c998e40
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/NameEntityExtractor.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.util.Span;
+
+/**
+ * Collects candidate location names from text using an OpenNLP {@link NameFinderME}
+ * and ranks them by how often they occur.
+ */
+public class NameEntityExtractor {
+    /**
+     * Names found so far. After {@link #getBestNameEntity()} has run, this holds each
+     * distinct name once, most frequent first.
+     */
+    ArrayList<String> locationNameEntities;
+    /** The most frequent name, or {@code null} if none has been determined yet. */
+    String bestNameEntity;
+    /** Occurrence count per name; kept across calls to {@link #getBestNameEntity()}. */
+    private final HashMap<String, Integer> tf;
+    private final NameFinderME nameFinder;
+
+    /**
+     * @param nameFinder the OpenNLP name finder used to locate names; it is synchronized on
+     *                   while in use, so one instance may be shared between extractors
+     * @throws IOException not thrown by the current implementation; kept in the signature
+     *                     so existing callers keep compiling
+     */
+    public NameEntityExtractor(NameFinderME nameFinder) throws IOException {
+        this.locationNameEntities = new ArrayList<>();
+        this.bestNameEntity = null;
+        this.nameFinder = nameFinder;
+        this.tf = new HashMap<>();
+    }
+
+    /**
+     * Uses OpenNLP to extract the location names that appear in the stream and appends
+     * them to {@link #locationNameEntities}. OpenNLP's default name finder accuracy is
+     * not very good; please refer to its documentation.
+     *
+     * @param stream the text to analyse; it is read fully as UTF-8 and split on single spaces
+     * @throws IOException if the stream cannot be read
+     */
+    public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
+        String[] tokens = IOUtils.toString(stream, UTF_8).split(" ");
+        Span[] spans;
+
+        //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+        synchronized (nameFinder) {
+            spans = nameFinder.find(tokens);
+            //the same name finder is reused, so clear adaptive data
+            nameFinder.clearAdaptiveData();
+        }
+
+        for (String found : Span.spansToStrings(spans, tokens)) {
+            // As before, a comma inside a found name separates entities. Empty pieces
+            // (no names at all, or a token ending in a comma) used to be added as ""
+            // entities and could even win as "best" name; they are now skipped.
+            for (String piece : found.split(",")) {
+                String name = piece.trim();
+                if (!name.isEmpty()) {
+                    this.locationNameEntities.add(name);
+                }
+            }
+        }
+    }
+
+    /**
+     * Determines the best location entity extracted so far: simply the most frequent one.
+     * If several entities share the highest frequency, one of them is picked at random.
+     * May not be the optimal solution, but works.
+     * <p>
+     * On return {@link #bestNameEntity} holds the winner and {@link #locationNameEntities}
+     * holds the distinct names in descending order of frequency. Nothing changes when no
+     * names have been collected.
+     */
+    public void getBestNameEntity() {
+        if (this.locationNameEntities.isEmpty()) {
+            return;
+        }
+
+        for (String name : this.locationNameEntities) {
+            Integer seen = tf.get(name);
+            tf.put(name, seen == null ? 1 : seen + 1);
+        }
+
+        List<Map.Entry<String, Integer>> ranked = new ArrayList<>(tf.entrySet());
+        // Shuffle first: the sort below is stable, so entries with equal counts
+        // keep their shuffled (random) relative order.
+        Collections.shuffle(ranked);
+        Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
+            @Override
+            public int compare(Map.Entry<String, Integer> o1,
+                               Map.Entry<String, Integer> o2) {
+                // Descending Order
+                return o2.getValue().compareTo(o1.getValue());
+            }
+        });
+
+        // update so that they are in descending order
+        this.locationNameEntities.clear();
+        for (Map.Entry<String, Integer> entry : ranked) {
+            this.locationNameEntities.add(entry.getKey());
+        }
+        // ranked is non-empty here and sorted by descending count, so its head is the winner
+        this.bestNameEntity = ranked.get(0).getKey();
+    }
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
new file mode 100644
index 0000000..110c504
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+
+public class GrobidRESTParser {
+
+ private static final Logger LOG = LoggerFactory.getLogger(GrobidRESTParser.class);
+
+
+ private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+ private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
+ // doesn't work
+ // nfc why
+
+ private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+
+ private String restHostUrlStr;
+
+ public GrobidRESTParser() {
+ String restHostUrlStr = null;
+ try {
+ restHostUrlStr = readRestUrl();
+ } catch (IOException e) {
+ LOG.warn("can't read rest url", e);
+ }
+
+ if (restHostUrlStr == null
+ || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+ this.restHostUrlStr = GROBID_REST_HOST;
+ } else {
+ this.restHostUrlStr = restHostUrlStr;
+ }
+ }
+
+ public void parse(String filePath, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws FileNotFoundException {
+
+ File pdfFile = new File(filePath);
+ ContentDisposition cd = new ContentDisposition(
+ "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+ Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
+ MultipartBody body = new MultipartBody(att);
+
+ Response response = WebClient
+ .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+ .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+ .post(body);
+
+ try {
+ String resp = response.readEntity(String.class);
+ Metadata teiMet = new TEIDOMParser().parse(resp, context);
+ for (String key : teiMet.names()) {
+ metadata.add("grobid:header_" + key, teiMet.get(key));
+ }
+ } catch (Exception e) {
+ LOG.warn("Couldn't read response", e);
+ }
+ }
+
+ private static String readRestUrl() throws IOException {
+ Properties grobidProperties = new Properties();
+ grobidProperties.load(GrobidRESTParser.class
+ .getResourceAsStream("GrobidExtractor.properties"));
+
+ return grobidProperties.getProperty("grobid.server.url");
+ }
+
+ protected static boolean canRun() {
+ Response response = null;
+
+ try {
+ response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH)
+ .accept(MediaType.TEXT_HTML).get();
+ String resp = response.readEntity(String.class);
+ return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+ } catch (Exception e) {
+ //swallow...can't run
+ return false;
+ }
+ }
+
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
new file mode 100644
index 0000000..b79ec93
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
@@ -0,0 +1,882 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+public class TEIDOMParser {
+
+ /** Creates a parser; it takes no configuration. */
+ public TEIDOMParser() {
+ }
+
+ public Metadata parse(String source, ParseContext parseContext) throws TikaException, SAXException, IOException {
+
+ Document root = XMLReaderUtils.buildDOM(
+ new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8)), parseContext);
+
+ Metadata metadata = new Metadata();
+ createGrobidMetadata(source, root.getDocumentElement(), metadata);
+ return metadata;
+ }
+
+ /**
+  * Fills {@code metadata} from the TEI document element: the language from the
+  * {@code text} child, and the {@code fileDesc} and {@code profileDesc} sections of
+  * the {@code teiHeader} child. The static entries are always added, even when
+  * {@code root} is {@code null}.
+  *
+  * @param source   the original XML text, stored verbatim by {@link #addStaticMet}
+  * @param root     the document element, may be {@code null}
+  * @param metadata receives the extracted fields
+  */
+ private void createGrobidMetadata(String source, Element root,
+                                   Metadata metadata) {
+     if (root != null) {
+
+         Node text = getFirstChild(root.getChildNodes(), "text");
+         if (text != null) {
+             parseText(text, metadata);
+         }
+
+         Node teiHeader = getFirstChild(root.getChildNodes(), "teiHeader");
+         // A document without a teiHeader used to trigger a NullPointerException here.
+         if (teiHeader != null) {
+             Node fileDesc = getFirstChild(teiHeader.getChildNodes(), "fileDesc");
+             if (fileDesc != null) {
+                 parseFileDesc(fileDesc, metadata);
+             }
+             Node profileDesc = getFirstChild(teiHeader.getChildNodes(), "profileDesc");
+             if (profileDesc != null) {
+                 parseProfileDesc(profileDesc, metadata);
+             }
+         }
+
+     }
+
+     addStaticMet(source, root, metadata);
+ }
+
+ /**
+  * Adds the entries that do not depend on the document structure: the name of the
+  * {@link Metadata} class under "Class" and the raw XML under "TEIXMLSource".
+  * The second parameter is not used.
+  */
+ private void addStaticMet(String source, Element root, Metadata metadata) {
+     String metadataClassName = Metadata.class.getName();
+     metadata.add("Class", metadataClassName);
+     // A "TEIJSONSource" entry is no longer written: it relied on json.org's
+     // toJSONObject(), which is no longer available.
+     metadata.add("TEIXMLSource", source);
+ }
+
+ /** Adds the {@code lang} attribute of the text node, if present, as "Language". */
+ private void parseText(Node text, Metadata metadata) {
+     String language = getFirstAttribute(text, "xml", "lang");
+     if (language == null) {
+         return;
+     }
+     metadata.add("Language", language);
+ }
+
+ /**
+  * Handles the {@code fileDesc} section: processes its {@code titleStmt} child and
+  * then its {@code sourceDesc} child, each only when present.
+  */
+ private void parseFileDesc(Node fileDesc, Metadata metadata) {
+     NodeList sections = fileDesc.getChildNodes();
+
+     Node titleStatement = getFirstChild(sections, "titleStmt");
+     if (titleStatement != null) {
+         parseTitleStmt(titleStatement, metadata);
+     }
+
+     Node sourceDescription = getFirstChild(sections, "sourceDesc");
+     if (sourceDescription != null) {
+         parseSourceDesc(sourceDescription, metadata);
+     }
+ }
+
+ /** Adds the text of the first {@code title} child, if any, as "Title". */
+ private void parseTitleStmt(Node titleStmt, Metadata metadata) {
+     Node titleNode = getFirstChild(titleStmt.getChildNodes(), "title");
+     if (titleNode == null) {
+         return;
+     }
+     String text = titleNode.getTextContent();
+     if (text == null) {
+         return;
+     }
+     metadata.add("Title", text);
+ }
+
+ /** Processes the first {@code biblStruct} child of the section, if there is one. */
+ private void parseSourceDesc(Node sourceDesc, Metadata metadata) {
+     Node bibliography = getFirstChild(sourceDesc.getChildNodes(), "biblStruct");
+     if (bibliography == null) {
+         return;
+     }
+     parseBiblStruct(bibliography, metadata);
+ }
+
+ /**
+  * Reads the authors from the {@code analytic} child and adds the derived "Address",
+  * "Affiliation", "Authors" and "FullAffiliations" entries. When there is no
+  * {@code analytic} child an "Error" entry is added instead.
+  */
+ private void parseBiblStruct(Node biblStruct, Metadata metadata) {
+     Node analytic = getFirstChild(biblStruct.getChildNodes(), "analytic");
+     if (analytic == null) {
+         metadata.add("Error", "Unable to parse: no analytic section in JSON");
+         return;
+     }
+
+     List<Author> authors = new ArrayList<>();
+     for (Node authorNode : getChildNodes(analytic.getChildNodes(), "author")) {
+         parseAuthor(authorNode, authors);
+     }
+
+     metadata.add("Address", getMetadataAddresses(authors));
+     metadata.add("Affiliation", getMetadataAffiliations(authors));
+     metadata.add("Authors", getMetadataAuthors(authors));
+     metadata.add("FullAffiliations", getMetadataFullAffiliations(authors));
+ }
+
+ /**
+  * Renders the distinct affiliations of all authors, in order of first appearance,
+  * as {@code [a1,a2,...]} using each affiliation's {@code toString()}.
+  *
+  * @return the bracketed list; {@code "[]"} when there are no affiliations
+  */
+ private String getMetadataFullAffiliations(List<Author> authorList) {
+     List<Affiliation> unique = new ArrayList<Affiliation>();
+     for (Author a : authorList) {
+         for (Affiliation af : a.getAffiliations()) {
+             if (!unique.contains(af)) {
+                 unique.add(af);
+             }
+         }
+     }
+
+     // The separator is written before every element but the first. The previous code
+     // removed a trailing comma with append(deleteCharAt(..)), which appended the
+     // builder to itself and so duplicated the whole text.
+     StringBuilder metAffils = new StringBuilder("[");
+     for (int i = 0; i < unique.size(); i++) {
+         if (i > 0) {
+             metAffils.append(",");
+         }
+         metAffils.append(unique.get(i).toString());
+     }
+     metAffils.append("]");
+     return metAffils.toString();
+ }
+
+ /**
+  * Renders every author as first, middle and last name followed by the 1-based
+  * numbers of that author's affiliations (numbered over all distinct affiliations in
+  * order of first appearance), e.g.
+  * "Chris A. Mattmann 1,2 Daniel J. Crichton 1 ".
+  */
+ private String getMetadataAuthors(List<Author> authorList) {
+     List<Affiliation> distinct = new ArrayList<Affiliation>();
+     for (Author author : authorList) {
+         for (Affiliation candidate : author.getAffiliations()) {
+             if (!distinct.contains(candidate)) {
+                 distinct.add(candidate);
+             }
+         }
+     }
+
+     StringBuilder rendered = new StringBuilder();
+     for (Author author : authorList) {
+         rendered.append(printOrBlank(author.getFirstName()))
+                 .append(printOrBlank(author.getMiddleName()))
+                 .append(printOrBlank(author.getSurName()));
+
+         // comma-separated affiliation numbers, without a trailing comma
+         boolean needsComma = false;
+         for (int pos = 0; pos < distinct.size(); pos++) {
+             if (!author.getAffiliations().contains(distinct.get(pos))) {
+                 continue;
+             }
+             if (needsComma) {
+                 rendered.append(",");
+             }
+             rendered.append(pos + 1);
+             needsComma = true;
+         }
+
+         rendered.append(" ");
+     }
+
+     return rendered.toString();
+ }
+
+ /**
+  * Renders the distinct affiliations as a numbered, "; "-separated list of
+  * organisation names, e.g. "1 Jet Propulsion Laboratory California Institute of
+  * Technology; 2 Computer Science Department University of Southern California".
+  */
+ private String getMetadataAffiliations(List<Author> authorList) {
+     List<Affiliation> distinct = new ArrayList<Affiliation>();
+     for (Author author : authorList) {
+         for (Affiliation candidate : author.getAffiliations()) {
+             if (!distinct.contains(candidate)) {
+                 distinct.add(candidate);
+             }
+         }
+     }
+
+     StringBuilder rendered = new StringBuilder();
+     int ordinal = 0;
+     for (Affiliation affiliation : distinct) {
+         ordinal++;
+         rendered.append(ordinal).append(" ").append(affiliation.getOrgName().toString());
+         // drop the last character written so far before adding the separator
+         rendered.setLength(rendered.length() - 1);
+         rendered.append("; ");
+     }
+
+     if (ordinal > 0) {
+         // remove the separator that follows the last entry
+         rendered.setLength(rendered.length() - 2);
+     }
+
+     return rendered.toString();
+ }
+
+ /**
+  * Renders the distinct addresses of all affiliations, each followed by a space, e.g.
+  * "Pasadena, CA 91109 USA Los Angeles, CA 90089 USA ".
+  */
+ private String getMetadataAddresses(List<Author> authorList) {
+     List<Address> distinct = new ArrayList<Address>();
+     for (Author author : authorList) {
+         for (Affiliation affiliation : author.getAffiliations()) {
+             Address candidate = affiliation.getAddress();
+             if (distinct.contains(candidate)) {
+                 continue;
+             }
+             distinct.add(candidate);
+         }
+     }
+
+     StringBuilder rendered = new StringBuilder();
+     for (Address address : distinct) {
+         rendered.append(address.toString()).append(" ");
+     }
+     return rendered.toString();
+ }
+
+ /**
+  * Builds an {@code Author} from an author node (names from its {@code persName}
+  * child, plus every {@code affiliation} child) and appends it to {@code authorList}.
+  */
+ private void parseAuthor(Node authorNode, List<Author> authorList) {
+     Author parsed = new Author();
+
+     Node personName = getFirstChild(authorNode.getChildNodes(), "persName");
+     if (personName != null) {
+         for (Node forename : getChildNodes(personName.getChildNodes(), "forename")) {
+             parseNamePart(forename, parsed);
+         }
+
+         Node surname = getFirstChild(personName.getChildNodes(), "surname");
+         String surnameText = (surname == null) ? null : surname.getTextContent();
+         if (surnameText != null) {
+             parsed.setSurName(surnameText);
+         }
+     }
+
+     for (Node affiliation : getChildNodes(authorNode.getChildNodes(), "affiliation")) {
+         parseOneAffiliation(affiliation, parsed);
+     }
+
+     authorList.add(parsed);
+ }
+
+ /**
+  * Stores the text of a name-part node as the author's first or middle name,
+  * depending on whether its {@code type} attribute is "first" or "middle".
+  * Any other type, a missing type or missing text is ignored.
+  */
+ private void parseNamePart(Node namePart, Author author) {
+     String kind = getFirstAttribute(namePart, null, "type");
+     String text = namePart.getTextContent();
+     if (kind == null || text == null) {
+         return;
+     }
+
+     switch (kind) {
+         case "first":
+             author.setFirstName(text);
+             break;
+         case "middle":
+             author.setMiddleName(text);
+             break;
+         default:
+             break;
+     }
+ }
+
+ /**
+  * Builds an {@code Affiliation} from an affiliation node (its first {@code address}
+  * child and all {@code orgName} children) and adds it to the author's affiliations.
+  */
+ private void parseOneAffiliation(Node affiliationNode, Author author) {
+     Affiliation parsed = new Affiliation();
+
+     Node addressNode = getFirstChild(affiliationNode.getChildNodes(), "address");
+     if (addressNode != null) {
+         parseAddress(addressNode, parsed);
+     }
+
+     OrgName organisation = new OrgName();
+     for (Node part : getChildNodes(affiliationNode.getChildNodes(), "orgName")) {
+         parseOrgName(part, organisation);
+     }
+     parsed.setOrgName(organisation);
+
+     author.getAffiliations().add(parsed);
+ }
+
+ /**
+  * Builds an {@code Address} from the {@code region}, {@code postCode},
+  * {@code settlement} and {@code country} children of an address node and sets it on
+  * the affiliation. Parts that are absent are left at their defaults.
+  */
+ private void parseAddress(Node addressNode, Affiliation affiliation) {
+     Address parsed = new Address();
+     NodeList parts = addressNode.getChildNodes();
+
+     Node regionNode = getFirstChild(parts, "region");
+     String regionText = (regionNode == null) ? null : regionNode.getTextContent();
+     if (regionText != null) {
+         parsed.setRegion(regionText);
+     }
+
+     Node postCodeNode = getFirstChild(parts, "postCode");
+     String postCodeText = (postCodeNode == null) ? null : postCodeNode.getTextContent();
+     if (postCodeText != null) {
+         parsed.setPostCode(postCodeText);
+     }
+
+     Node settlementNode = getFirstChild(parts, "settlement");
+     String settlementText = (settlementNode == null) ? null : settlementNode.getTextContent();
+     if (settlementText != null) {
+         parsed.setSettlment(settlementText);
+     }
+
+     Node countryNode = getFirstChild(parts, "country");
+     if (countryNode != null) {
+         Country parsedCountry = new Country();
+         String countryKey = getFirstAttribute(countryNode, null, "key");
+         if (countryKey != null) {
+             parsedCountry.setKey(countryKey);
+         }
+         String countryText = countryNode.getTextContent();
+         if (countryText != null) {
+             parsedCountry.setContent(countryText);
+         }
+         parsed.setCountry(parsedCountry);
+     }
+
+     affiliation.setAddress(parsed);
+ }
+
+ /**
+  * Appends one entry to {@code orgName} built from an orgName node: its text content
+  * as the name and its {@code type} attribute as the type, each only when present.
+  */
+ private void parseOrgName(Node orgNode, OrgName orgName) {
+     OrgTypeName entry = new OrgTypeName();
+
+     String nameText = orgNode.getTextContent();
+     if (nameText != null) {
+         entry.setName(nameText);
+     }
+
+     String typeValue = getFirstAttribute(orgNode, null, "type");
+     if (typeValue != null) {
+         entry.setType(typeValue);
+     }
+
+     orgName.getTypeNames().add(entry);
+ }
+
+ /**
+  * Handles the {@code profileDesc} section: adds the first paragraph of the abstract
+  * as "Abstract" and the keywords as "Keyword" entries. When the {@code keywords}
+  * element has no {@code term} children, its whole text is added as one keyword.
+  */
+ private void parseProfileDesc(Node profileDesc, Metadata metadata) {
+     Node abstractNode = getFirstChild(profileDesc.getChildNodes(), "abstract");
+     Node paragraph = (abstractNode == null)
+             ? null : getFirstChild(abstractNode.getChildNodes(), "p");
+     if (paragraph != null) {
+         metadata.add("Abstract", paragraph.getTextContent());
+     }
+
+     Node textClass = getFirstChild(profileDesc.getChildNodes(), "textClass");
+     if (textClass == null) {
+         return;
+     }
+     Node keywords = getFirstChild(textClass.getChildNodes(), "keywords");
+     if (keywords == null) {
+         return;
+     }
+
+     List<Node> terms = getChildNodes(keywords.getChildNodes(), "term");
+     if (terms.isEmpty()) {
+         // test AJ15.pdf
+         metadata.add("Keyword", keywords.getTextContent());
+         return;
+     }
+     for (Node term : terms) {
+         metadata.add("Keyword", term.getTextContent());
+     }
+ }
+
+ /**
+  * @return {@code val} followed by a space, or a single space when {@code val} is
+  *         {@code null} or empty
+  */
+ private String printOrBlank(String val) {
+     boolean missing = (val == null) || val.equals("");
+     return missing ? " " : val + " ";
+ }
+
+ /** An author: name parts plus the list of affiliations. */
+ class Author {
+
+     private String surName;
+
+     private String middleName;
+
+     private String firstName;
+
+     private List<Affiliation> affiliations;
+
+     /** Creates an author with no name parts and an empty, mutable affiliation list. */
+     public Author() {
+         this.surName = null;
+         this.middleName = null;
+         this.firstName = null;
+         this.affiliations = new ArrayList<Affiliation>();
+     }
+
+     /**
+      * @return the surName
+      */
+     public String getSurName() {
+         return surName;
+     }
+
+     /**
+      * @param surName the surName to set
+      */
+     public void setSurName(String surName) {
+         this.surName = surName;
+     }
+
+     /**
+      * @return the middleName
+      */
+     public String getMiddleName() {
+         return middleName;
+     }
+
+     /**
+      * @param middleName the middleName to set
+      */
+     public void setMiddleName(String middleName) {
+         this.middleName = middleName;
+     }
+
+     /**
+      * @return the firstName
+      */
+     public String getFirstName() {
+         return firstName;
+     }
+
+     /**
+      * @param firstName the firstName to set
+      */
+     public void setFirstName(String firstName) {
+         this.firstName = firstName;
+     }
+
+     /**
+      * @return the affiliations (the live list, not a copy)
+      */
+     public List<Affiliation> getAffiliations() {
+         return affiliations;
+     }
+
+     /**
+      * @param affiliations the affiliations to set
+      */
+     public void setAffiliations(List<Affiliation> affiliations) {
+         this.affiliations = affiliations;
+     }
+
+     /**
+      * Lists all fields; a {@code null} middle name is printed as an empty string.
+      */
+     @Override
+     public String toString() {
+         // The conditional must be parenthesised: '+' binds tighter than '!=', so without
+         // the parentheses the whole prefix was compared to null and the method returned
+         // just the middle name.
+         return "Author [surName=" + surName
+                 + ", middleName=" + (middleName != null ? middleName : "")
+                 + ", firstName=" + firstName
+                 + ", affiliations=" + affiliations + "]";
+     }
+
+ }
+
+ /** An affiliation: an organisation name together with an address. */
+ class Affiliation {
+
+     private OrgName orgName;
+
+     private Address address;
+
+     /** Creates an affiliation with an empty organisation name and an empty address. */
+     public Affiliation() {
+         this.orgName = new OrgName();
+         this.address = new Address();
+     }
+
+     /**
+      * @return the orgName
+      */
+     public OrgName getOrgName() {
+         return orgName;
+     }
+
+     /**
+      * @param orgName the orgName to set
+      */
+     public void setOrgName(OrgName orgName) {
+         this.orgName = orgName;
+     }
+
+     /**
+      * @return the address
+      */
+     public Address getAddress() {
+         return address;
+     }
+
+     /**
+      * @param address the address to set
+      */
+     public void setAddress(Address address) {
+         this.address = address;
+     }
+
+     /**
+      * Two affiliations are equal when both their addresses and their organisation
+      * names are equal. Returns {@code false} for {@code null} or an object of another
+      * type instead of throwing.
+      * NOTE: {@code hashCode()} is not overridden, so instances must not be used as
+      * keys in hash-based collections.
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof Affiliation)) {
+             return false;
+         }
+         Affiliation otherA = (Affiliation) obj;
+         return this.getAddress().equals(otherA.getAddress())
+                 && this.getOrgName().equals(otherA.getOrgName());
+     }
+
+     /**
+      * @return the organisation name and address in braces
+      */
+     @Override
+     public String toString() {
+         return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+     }
+
+ }
+
+ /** An organisation name made up of an ordered list of typed name parts. */
+ class OrgName {
+     private List<OrgTypeName> typeNames;
+
+     /** Creates an organisation name with no parts. */
+     public OrgName() {
+         this.typeNames = new ArrayList<OrgTypeName>();
+     }
+
+     /**
+      * @return the typeNames (the live list, not a copy)
+      */
+     public List<OrgTypeName> getTypeNames() {
+         return typeNames;
+     }
+
+     /**
+      * @param typeNames the typeNames to set
+      */
+     public void setTypeNames(List<OrgTypeName> typeNames) {
+         this.typeNames = typeNames;
+     }
+
+     /**
+      * @return the names of all parts, each followed by a space
+      */
+     @Override
+     public String toString() {
+         StringBuilder builder = new StringBuilder();
+         for (OrgTypeName on : this.typeNames) {
+             builder.append(on.getName());
+             builder.append(" ");
+         }
+         return builder.toString();
+     }
+
+     /**
+      * Two organisation names are equal when they have the same parts (name and type)
+      * in the same order. Previously only the number of parts was compared, so
+      * different organisations with equally many parts were treated as the same one.
+      * Returns {@code false} for {@code null} or an object of another type.
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof OrgName)) {
+             return false;
+         }
+         List<OrgTypeName> mine = this.typeNames;
+         List<OrgTypeName> theirs = ((OrgName) obj).getTypeNames();
+         if (mine == null || theirs == null) {
+             return mine == theirs;
+         }
+         if (mine.size() != theirs.size()) {
+             return false;
+         }
+         // Parts are compared field by field, null-safely, so that parts without a
+         // name or type can be compared as well.
+         for (int i = 0; i < mine.size(); i++) {
+             OrgTypeName a = mine.get(i);
+             OrgTypeName b = theirs.get(i);
+             if (a == null || b == null) {
+                 if (a != b) {
+                     return false;
+                 }
+                 continue;
+             }
+             if (!java.util.Objects.equals(a.getName(), b.getName())
+                     || !java.util.Objects.equals(a.getType(), b.getType())) {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     /**
+      * Based on the number of parts only; equal names always have equally many parts,
+      * so this is consistent with {@link #equals(Object)}.
+      */
+     @Override
+     public int hashCode() {
+         return typeNames == null ? 0 : typeNames.size();
+     }
+
+ }
+
+ /** One part of an organisation name: the text and its type. */
+ class OrgTypeName {
+     private String name;
+     private String type;
+
+     /** Creates a part whose name and type are both {@code null}. */
+     public OrgTypeName() {
+         this.name = null;
+         this.type = null;
+     }
+
+     /**
+      * @return the name
+      */
+     public String getName() {
+         return name;
+     }
+
+     /**
+      * @param name the name to set
+      */
+     public void setName(String name) {
+         this.name = name;
+     }
+
+     /**
+      * @return the type
+      */
+     public String getType() {
+         return type;
+     }
+
+     /**
+      * @param type the type to set
+      */
+     public void setType(String type) {
+         this.type = type;
+     }
+
+     /**
+      * Two parts are equal when both type and name are equal; {@code null} values are
+      * allowed and equal only to {@code null}. Returns {@code false} for {@code null}
+      * or an object of another type. Previously this threw when name or type was
+      * unset, which is their default.
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof OrgTypeName)) {
+             return false;
+         }
+         OrgTypeName otherOrgName = (OrgTypeName) obj;
+         return java.util.Objects.equals(this.type, otherOrgName.getType())
+                 && java.util.Objects.equals(this.name, otherOrgName.getName());
+     }
+
+     /** Consistent with {@link #equals(Object)}: derived from type and name. */
+     @Override
+     public int hashCode() {
+         return java.util.Objects.hash(type, name);
+     }
+
+ }
+
+ /** A postal address: settlement, region, post code and country. */
+ private class Address {
+
+     private String region;
+     private String postCode;
+     private String settlment;
+     private Country country;
+
+     /** Creates an address with all text fields {@code null} and an empty country. */
+     public Address() {
+         this.region = null;
+         this.postCode = null;
+         this.settlment = null;
+         this.country = new Country();
+     }
+
+     /**
+      * @return the region
+      */
+     public String getRegion() {
+         return region;
+     }
+
+     /**
+      * @param region the region to set
+      */
+     public void setRegion(String region) {
+         this.region = region;
+     }
+
+     /**
+      * @return the postCode
+      */
+     public String getPostCode() {
+         return postCode;
+     }
+
+     /**
+      * @param postCode the postCode to set
+      */
+     public void setPostCode(String postCode) {
+         this.postCode = postCode;
+     }
+
+     /**
+      * @return the settlment
+      */
+     public String getSettlment() {
+         return settlment;
+     }
+
+     /**
+      * @param settlment the settlment to set
+      */
+     public void setSettlment(String settlment) {
+         this.settlment = settlment;
+     }
+
+     /**
+      * @return the country
+      */
+     public Country getCountry() {
+         return country;
+     }
+
+     /**
+      * @param country the country to set
+      */
+     public void setCountry(Country country) {
+         this.country = country;
+     }
+
+     /**
+      * Two addresses are equal when settlement, country, post code and region are all
+      * equal; {@code null} fields are equal only to {@code null}. Previously the
+      * comparison stopped at the first {@code null} field and ignored the remaining
+      * ones, so e.g. two different cities without a post code compared equal.
+      * Returns {@code false} for {@code null} or an object of another type.
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof Address)) {
+             return false;
+         }
+         Address otherA = (Address) obj;
+         return java.util.Objects.equals(this.settlment, otherA.getSettlment())
+                 && java.util.Objects.equals(this.country, otherA.getCountry())
+                 && java.util.Objects.equals(this.postCode, otherA.getPostCode())
+                 && java.util.Objects.equals(this.region, otherA.getRegion());
+     }
+
+     /**
+      * Derived from the text fields only; the country is left out so the result does
+      * not depend on how {@code Country} computes its hash code. Equal addresses still
+      * produce equal hash codes.
+      */
+     @Override
+     public int hashCode() {
+         return java.util.Objects.hash(settlment, postCode, region);
+     }
+
+     /**
+      * @return "settlement, region postCode countryContent"; unset parts print as
+      *         "null"
+      */
+     @Override
+     public String toString() {
+         StringBuilder builder = new StringBuilder();
+         builder.append(settlment);
+         builder.append(", ");
+         builder.append(region);
+         builder.append(" ");
+         builder.append(postCode);
+         builder.append(" ");
+         // the country can be null after setCountry(null); print it like the other unset parts
+         builder.append(country == null ? null : country.getContent());
+         return builder.toString();
+     }
+ }
+
+ /** A country: a key plus the text content. */
+ private class Country {
+     private String key;
+     private String content;
+
+     /** Creates a country whose key and content are both {@code null}. */
+     public Country() {
+         this.key = null;
+         this.content = null;
+     }
+
+     /**
+      * @return the key
+      */
+     public String getKey() {
+         return key;
+     }
+
+     /**
+      * @param key the key to set
+      */
+     public void setKey(String key) {
+         this.key = key;
+     }
+
+     /**
+      * @return the content
+      */
+     public String getContent() {
+         return content;
+     }
+
+     /**
+      * @param content the content to set
+      */
+     public void setContent(String content) {
+         this.content = content;
+     }
+
+     /**
+      * Two countries are equal when both key and content are equal; {@code null}
+      * values are equal only to {@code null}. Returns {@code false} for {@code null}
+      * or an object of another type instead of throwing.
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof Country)) {
+             return false;
+         }
+         Country otherC = (Country) obj;
+         return java.util.Objects.equals(this.key, otherC.getKey())
+                 && java.util.Objects.equals(this.content, otherC.getContent());
+     }
+
+     /** Consistent with {@link #equals(Object)}: derived from key and content. */
+     @Override
+     public int hashCode() {
+         return java.util.Objects.hash(key, content);
+     }
+ }
+
+ /**
+  * Finds the first node in {@code childNodes} whose node name equals {@code name}.
+  *
+  * @return that node, or {@code null} if no node matches
+  */
+ private static Node getFirstChild(NodeList childNodes, String name) {
+     int total = childNodes.getLength();
+     int idx = 0;
+     while (idx < total) {
+         Node candidate = childNodes.item(idx);
+         if (candidate.getNodeName().equals(name)) {
+             return candidate;
+         }
+         idx++;
+     }
+     return null;
+ }
+
+ /**
+  * Returns the value of the first attribute of {@code node} whose local name equals
+  * {@code name}.
+  *
+  * @param node the node whose attributes are searched
+  * @param ns   currently ignored: attributes are matched by local name only
+  * @param name the local name to look for
+  * @return the attribute value, or {@code null} if there is no such attribute
+  */
+ private static String getFirstAttribute(Node node, String ns, String name) {
+     if (!node.hasAttributes()) {
+         return null;
+     }
+     NamedNodeMap attrs = node.getAttributes();
+     for (int i = 0; i < attrs.getLength(); i++) {
+         Node attr = attrs.item(i);
+         String attrName = attr.getLocalName();
+         if (attrName == null) {
+             // getLocalName() is null for nodes created without namespace support, which
+             // used to cause a NullPointerException; fall back to the qualified name
+             // without its prefix.
+             String qualified = attr.getNodeName();
+             attrName = qualified.substring(qualified.indexOf(':') + 1);
+         }
+         if (attrName.equals(name)) {
+             return attr.getNodeValue();
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Collects, in document order, every node in {@code childNodes} whose local name
+  * equals {@code localName}. Nodes without a local name never match.
+  *
+  * @return a new list of the matching nodes; empty if there are none
+  */
+ private static List<Node> getChildNodes(NodeList childNodes, String localName) {
+     List<Node> matches = new ArrayList<>();
+     int total = childNodes.getLength();
+     for (int idx = 0; idx < total; idx++) {
+         Node candidate = childNodes.item(idx);
+         String candidateName = candidate.getLocalName();
+         if (candidateName == null) {
+             continue;
+         }
+         if (candidateName.equals(localName)) {
+             matches.add(candidate);
+         }
+     }
+     return matches;
+ }
+
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..0e3d2d7
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collection;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Properties;
+import java.util.Iterator;
+import java.util.Locale;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * ne_chunk() module of NLTK. This NER requires additional setup,
+ * due to Http requests to an endpoint server that runs NLTK.
+ * See <a href="http://wiki.apache.org/tika/TikaAndNLTK">TikaAndNLTK</a>.
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+    // Static: once any instance has seen the server answer 200, every instance reports available.
+    private static boolean available = false;
+    // Fallback endpoint used when no URL can be read from NLTKServer.properties.
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
+    private String restHostUrlStr;
+    /**
+     * some common entities identified by NLTK
+     */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>();
+
+    static {
+        // Plain static initialiser instead of double-brace initialisation,
+        // which would create an anonymous HashSet subclass.
+        ENTITY_TYPES.add("NAMES");
+    }
+
+    /**
+     * Resolves the REST endpoint (the {@code nltk.server.url} property of
+     * {@code NLTKServer.properties}, falling back to
+     * {@code http://localhost:8881}) and probes it with a GET request.
+     * The recogniser is flagged as available when the probe answers 200.
+     * Failures are logged and never propagated.
+     */
+    public NLTKNERecogniser() {
+        String configuredUrl = null;
+        try {
+            configuredUrl = readRestUrl();
+        } catch (IOException e) {
+            LOG.warn("Can't read rest url", e);
+        }
+
+        if (configuredUrl == null || configuredUrl.equals("")) {
+            this.restHostUrlStr = NLTK_REST_HOST;
+        } else {
+            this.restHostUrlStr = configuredUrl;
+        }
+
+        Response response = null;
+        try {
+            // Probe the resolved endpoint (the field), not the raw configured
+            // value, which may be empty or null when the fallback is in use.
+            response = WebClient.create(this.restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            int responseCode = response.getStatus();
+            if (responseCode == 200) {
+                available = true;
+            } else {
+                LOG.info("NLTKRest Server is not running");
+            }
+        } catch (Exception e) {
+            LOG.warn(e.getMessage(), e);
+        } finally {
+            closeQuietly(response);
+        }
+    }
+
+    /**
+     * Reads the configured server URL from {@code NLTKServer.properties},
+     * looked up relative to this class.
+     *
+     * @return the value of {@code nltk.server.url}, or {@code null} if the
+     *         property is not set
+     * @throws IOException if the resource is missing or cannot be read
+     */
+    private static String readRestUrl() throws IOException {
+        Properties nltkProperties = new Properties();
+        // Fully qualified to avoid requiring an extra import in this file.
+        try (java.io.InputStream in =
+                     NLTKNERecogniser.class.getResourceAsStream("NLTKServer.properties")) {
+            if (in == null) {
+                // Properties.load(null) would fail with a NullPointerException;
+                // report the missing resource as the declared exception instead.
+                throw new IOException("NLTKServer.properties not found on classpath");
+            }
+            nltkProperties.load(in);
+        }
+        return nltkProperties.getProperty("nltk.server.url");
+    }
+
+    /**
+     * Releases a response, logging rather than propagating any failure.
+     *
+     * @param response the response to close; may be {@code null}
+     */
+    private static void closeQuietly(Response response) {
+        if (response == null) {
+            return;
+        }
+        try {
+            response.close();
+        } catch (Exception e) {
+            LOG.debug("Failed to close response", e);
+        }
+    }
+
+    /**
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * <p>
+     * The text is POSTed to the {@code /nltk} path of the configured server.
+     * Every key of the JSON reply except {@code "result"} is treated as an
+     * entity type; its upper-cased form is recorded in {@link #ENTITY_TYPES}
+     * and used as the key of the returned map. Any failure is logged at
+     * debug level and yields whatever was collected so far.
+     *
+     * @param text text which possibly contains names
+     * @return map of entity type -&gt; set of names
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        Response response = null;
+        try {
+            String url = restHostUrlStr + "/nltk";
+            response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+            int responseCode = response.getStatus();
+            if (responseCode == 200) {
+                String result = response.readEntity(String.class);
+                JSONParser parser = new JSONParser();
+                JSONObject j = (JSONObject) parser.parse(result);
+                Iterator<?> keys = j.keySet().iterator();
+                while (keys.hasNext()) {
+                    String key = (String) keys.next();
+                    if (key.equals("result")) {
+                        continue;
+                    }
+                    Object value = j.get(key);
+                    if (!(value instanceof Collection)) {
+                        // One malformed entry should not discard the rest of the reply.
+                        LOG.debug("Ignoring non-collection value for entity type {}", key);
+                        continue;
+                    }
+                    Set<String> names = new HashSet<>();
+                    for (Object name : (Collection<?>) value) {
+                        if (name != null) {
+                            names.add(name.toString());
+                        }
+                    }
+                    // Register the same (upper-cased) type name that keys the result map.
+                    String entityType = key.toUpperCase(Locale.ENGLISH);
+                    ENTITY_TYPES.add(entityType);
+                    entities.put(entityType, names);
+                }
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        } finally {
+            closeQuietly(response);
+        }
+
+        return entities;
+    }
+
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..cea7492
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.geo.GeoParser
+org.apache.tika.parser.journal.JournalParser
+org.apache.tika.parser.sentiment.SentimentAnalysisParser
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
new file mode 100644
index 0000000..a7718ab
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+grobid.server.url=http://localhost:8080
+grobid.endpoint.text=/processQuantityText
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java
new file mode 100644
index 0000000..2b82af9
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+public class TEITest extends TikaTest {
+
+    /**
+     * Runs {@link TEIDOMParser} over the testTEI.xml test document and checks
+     * the address, keyword, author, title and affiliation metadata it yields.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        TEIDOMParser parser = new TEIDOMParser();
+
+        // Read the whole test document into memory as UTF-8 text.
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        try (InputStream teiStream = getResourceAsStream("/test-documents/testTEI.xml")) {
+            IOUtils.copy(teiStream, buffer);
+        }
+        String teiXml = new String(buffer.toByteArray(), StandardCharsets.UTF_8);
+
+        Metadata extracted = parser.parse(teiXml, new ParseContext());
+
+        String expectedAddress =
+                "Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                "Montbonnot Saint-Martin, null 38330, 38330, 38330, 38330 " +
+                "France, France, France, France ";
+        assertEquals(expectedAddress, extracted.get("Address"));
+
+        String[] expectedKeywords = {
+            "F22 [Analysis of Algorithms and Problem Complexity]: Nonnumerical Algorithms and Problems\u2014Sequencing",
+            "and scheduling; D41 [Operating Systems]: Process management\u2014Scheduling, Concurrency",
+            "Keywords",
+            "Parallel Computing, Algorithms, Scheduling, Parallel Tasks,",
+            "Moldable Tasks, Bi-criteria"
+        };
+        assertArrayEquals(expectedKeywords, extracted.getValues("Keyword"));
+
+        String expectedAuthors =
+                "Pierre-François Dutot 1 Lionel Eyraud 1 Grégory Gr´ 1 Grégory Mouní 1 Denis Trystram 1 ";
+        assertEquals(expectedAuthors, extracted.get("Authors"));
+
+        String expectedTitle = "Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms *";
+        assertEquals(expectedTitle, extracted.get("Title"));
+
+        String expectedAffiliation = "1 ID-IMAG ID-IMAG ID-IMAG ID-IMAG";
+        assertEquals(expectedAffiliation, extracted.get("Affiliation"));
+
+        String expectedFullAffiliations =
+                "[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " +
+                "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                "null 38330, 38330, 38330, 38330 France, France, France, France}" +
+                "[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " +
+                "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                "null 38330, 38330, 38330, 38330 France, France, France, France}]";
+        assertEquals(expectedFullAffiliations, extracted.get("FullAffiliations"));
+    }
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
new file mode 100644
index 0000000..6e17415
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Test case for {@link NamedEntityParser}
+ */
+public class NamedEntityParserTest extends TikaTest {
+
+    public static final String CONFIG_FILE = "tika-config.xml";
+
+    /**
+     * Parses a short English text and looks for the expected person,
+     * location, organization and date entities in the metadata.
+     * <p>
+     * Expectations are expressed with {@code assumeTrue}, so an unmet
+     * expectation marks the test as skipped rather than failed.
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        //test config is added to resources directory
+        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+        Tika tika = new Tika(config);
+        String text = "I am student at University of Southern California (USC)," +
+                " located in Los Angeles . USC's football team is called by name Trojans." +
+                " Mr. John McKay was a head coach of the team from 1960 - 1975";
+        Metadata md = new Metadata();
+        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
+
+        assumeTrue(valuesOf(md, "X-Parsed-By").contains(NamedEntityParser.class.getName()));
+        assumeTrue(valuesOf(md, "NER_PERSON").contains("John McKay"));
+        assumeTrue(valuesOf(md, "NER_LOCATION").contains("Los Angeles"));
+        assumeTrue(valuesOf(md, "NER_ORGANIZATION").contains("University of Southern California"));
+        assumeTrue(valuesOf(md, "NER_DATE").contains("1960 - 1975"));
+
+    }
+
+    /**
+     * Configures a chain of two recognisers through the
+     * {@link NamedEntityParser#SYS_PROP_NER_IMPL} system property and checks
+     * that entity types from both show up in the metadata.
+     * <p>
+     * The system property is JVM-wide, so its previous value is restored
+     * afterwards to keep this test from influencing others.
+     */
+    @Test
+    public void testNerChain() throws Exception {
+        String classNames = OpenNLPNERecogniser.class.getName()
+                + "," + RegexNERecogniser.class.getName();
+        String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
+        try {
+            TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+            Tika tika = new Tika(config);
+            String text = "University of Southern California (USC), is located in Los Angeles ." +
+                    " Campus is busy from monday to saturday";
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
+            HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
+            assumeTrue(keys.contains("NER_WEEK_DAY"));
+            assumeTrue(keys.contains("NER_LOCATION"));
+        } finally {
+            // Runs even when an assumption aborts the test.
+            if (previousImpl == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
+            }
+        }
+
+    }
+
+    /**
+     * Gathers all values stored under one metadata name into a set.
+     *
+     * @param md the metadata to read from
+     * @param name the metadata name whose values are wanted
+     * @return a new set holding the values found under {@code name}
+     */
+    private static HashSet<String> valuesOf(Metadata md, String name) {
+        return new HashSet<String>(Arrays.asList(md.getValues(name)));
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..4b0101e
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+    /**
+     * Selects {@link NLTKNERecogniser} through the
+     * {@link NamedEntityParser#SYS_PROP_NER_IMPL} system property, parses a
+     * short sentence and, when any {@code NER_NAMES} values were produced,
+     * checks that they consist of exactly "America".
+     * <p>
+     * An empty result is tolerated, so the test passes when nothing is
+     * recognised. The system property is JVM-wide; its previous value is
+     * restored afterwards so other tests are not affected.
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        String text = "America is a big country.";
+        String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+        try {
+            Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+            if (!names.isEmpty()) {
+                assertTrue(names.contains("America"));
+                assertTrue(names.size() == 1);
+            }
+        } finally {
+            if (previousImpl == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
+            }
+        }
+    }
+}
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
new file mode 100755
index 0000000..c17899e
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Getting OpenNLP NER models"
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-person.bin" -O ner-person.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-location.bin" -O ner-location.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-organization.bin" -O ner-organization.bin
+
+# Additional 4
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-date.bin" -O ner-date.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-money.bin" -O ner-money.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-time.bin" -O ner-time.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-percentage.bin" -O ner-percentage.bin
\ No newline at end of file
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/regex/tika-config.xml b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/regex/tika-config.xml
new file mode 100644
index 0000000..267c399
--- /dev/null
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/regex/tika-config.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.ner.NamedEntityParser">
+ <mime>text/plain</mime>
+ <mime>text/html</mime>
+ <mime>application/xhtml+xml</mime>
+ </parser>
+ </parsers>
+
+</properties>
\ No newline at end of file