You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/03/02 06:42:10 UTC

[01/20] tika git commit: Added NLTK NER

Repository: tika
Updated Branches:
  refs/heads/master 7c245fa87 -> 9056894da


Added NLTK NER


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d685742c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d685742c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d685742c

Branch: refs/heads/master
Commit: d685742c6e81b9153ce881f9622292104a4144d2
Parents: 6a09233
Author: manali <ma...@gmail.com>
Authored: Tue Feb 2 00:12:28 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Feb 2 00:12:28 2016 -0800

----------------------------------------------------------------------
 .gitignore                                      |   3 +-
 tika-parsers/pom.xml                            |   7 +
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 161 +++++++++++++++++++
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  40 +++++
 4 files changed, 210 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c262c68..40c895f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ target
 *.iws
 *.bin
 nbactions.xml
-nb-configuration.xml
\ No newline at end of file
+nb-configuration.xml
+*.DS_Store

http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 57497ec..8d330c3 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -88,6 +88,13 @@
       <version>2.1.1</version>
     </dependency>
 
+
+    <!-- Apache HttpClient, used by NLTKNERecogniser to call the NLTK REST server -->
+    <dependency>
+      <groupId>org.apache.httpcomponents</groupId>
+      <artifactId>httpclient</artifactId>
+      <version>4.5.1</version>
+    </dependency>
     <!-- Optional OSGi dependencies, used only when running within OSGi -->
     <dependency>
       <groupId>org.apache.felix</groupId>

http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..eb216ea
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.http.client.methods.HttpGet;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import org.apache.http.HttpResponse;
+import org.apache.http.NameValuePair;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.message.BasicNameValuePair;
+
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  the Natural Language Toolkit (NLTK). This NER requires additional setup,
+ *  because it calls an NLTK REST server over HTTP at runtime.
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNER#NLTK">
+ *      Tika NER Wiki</a> for configuring this recogniser.
+ *  @see NERecogniser
+ *
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+    private final static String USER_AGENT = "Mozilla/5.0";
+    private static boolean available = false;
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add(PERSON);
+        add(TIME);
+        add(LOCATION);
+        add(ORGANIZATION);
+        add(MONEY);
+        add(PERCENT);
+        add(DATE);
+        add(FACILITY);
+        add(GPE);
+    }};
+
+    public NLTKNERecogniser(){
+        try {
+
+            String url = "http://localhost:5000/";
+            HttpClient client = HttpClientBuilder.create().build();
+            HttpGet get = new HttpGet(url);
+
+            // add header
+            get.setHeader("User-Agent", USER_AGENT);
+            HttpResponse response = client.execute(get);
+            int responseCode = response.getStatusLine().getStatusCode();
+            if(responseCode == 200){
+                available = true;
+            }
+            else{
+                LOG.info("NLTKRest Server is not running");
+            }
+
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+    }
+
+
+    /**
+     *
+     * @return {@code true} if the NLTK REST server answered the probe sent by the constructor with HTTP 200;
+     * returns {@code false} when this recogniser is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        try {
+            String url = "http://localhost:5000/nltk";
+            HttpClient client = HttpClientBuilder.create().build();
+            HttpPost post = new HttpPost(url);
+            // add header
+            post.setHeader("User-Agent", USER_AGENT);
+            List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
+            urlParameters.add(new BasicNameValuePair("text", text));
+            post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+            HttpResponse response = client.execute(post);
+
+            int responseCode = response.getStatusLine().getStatusCode();
+            if (responseCode == 200) {
+                BufferedReader rd = new BufferedReader(
+                        new InputStreamReader(response.getEntity().getContent()));
+
+                String result = rd.readLine();
+
+                JSONParser parser = new JSONParser();
+                JSONObject j = (JSONObject) parser.parse(result);
+                JSONArray aa = new JSONArray();
+                for (Object x : j.keySet()) {
+                    aa = (JSONArray) j.get(x.toString());
+                    Set s = new HashSet();
+                    for (Object y : aa) {
+                        s.add(y.toString());
+                    }
+                    entities.put(x.toString(), s);
+                }
+            }
+        }
+        catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+        ENTITY_TYPES.clear();
+        ENTITY_TYPES.addAll(entities.keySet());
+        LOG.info("returning this:" + entities.keySet().toString());
+        return entities;
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..4fbeb42
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,40 @@
+package org.apache.tika.parser.ner.nltk;
+
+/**
+ * Created by manali on 2/1/16.
+ */
+import org.apache.commons.logging.Log;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+    @Test
+    public void testGetEntityTypes() throws Exception {
+
+        String text = "America";
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+
+        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+        Metadata md = new Metadata();
+        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+
+        Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
+        if(gpe.size() == 0) return;
+        else {
+            assertTrue(gpe.contains("America"));
+            assertTrue(gpe.size() == 1); //and nothing else
+        }
+    }
+}


[05/20] tika git commit: Update NLTKNERecogniserTest.java

Posted by ma...@apache.org.
Update NLTKNERecogniserTest.java

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/892beca8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/892beca8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/892beca8

Branch: refs/heads/master
Commit: 892beca8b7c76c7849e9a71a0e252fca606753af
Parents: 59ddcaa
Author: Manali Shah <ma...@usc.edu>
Authored: Tue Feb 2 00:36:22 2016 -0800
Committer: Manali Shah <ma...@usc.edu>
Committed: Tue Feb 2 00:36:22 2016 -0800

----------------------------------------------------------------------
 .../tika/parser/ner/nltk/NLTKNERecogniserTest.java  | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/892beca8/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 23a174c..ac04066 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.parser.ner.nltk;
 
 import org.apache.commons.logging.Log;


[08/20] tika git commit: Merge remote-tracking branch 'upstream/master'

Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master'


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f054bcd1
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f054bcd1
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f054bcd1

Branch: refs/heads/master
Commit: f054bcd1a0a57a730ebc53e7a2cfd4215a7526fb
Parents: 1b14b39 28b9a66
Author: manali <ma...@gmail.com>
Authored: Fri Feb 19 17:37:29 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 19 17:37:29 2016 -0800

----------------------------------------------------------------------
 CHANGES.txt                                     |    8 +
 tika-bundle/pom.xml                             |    6 +-
 .../apache/tika/metadata/TikaMetadataKeys.java  |    2 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |   17 +-
 tika-parsers/pom.xml                            |    6 +-
 .../microsoft/AbstractPOIFSExtractor.java       |   16 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java |    9 +-
 .../tika/parser/pot/PooledTimeSeriesParser.java |    7 +
 .../tika/parser/rtf/RTFEmbObjHandler.java       |    2 +-
 .../org/apache/tika/mime/TestMimeTypes.java     |   25 +
 .../microsoft/POIContainerExtractionTest.java   |   36 +
 .../parser/microsoft/PowerPointParserTest.java  |    4 +-
 .../apache/tika/parser/rtf/RTFParserTest.java   |    9 +-
 .../resources/test-documents/testJS_HTML.js     |   91 +
 .../test-documents/testPKCS17Sig-v2.xml.p7m     |  Bin 0 -> 9682 bytes
 .../test-documents/testPKCS17Sig-v3.xml.p7m     |  305 ++
 .../test-documents/testPKCS17Sig-v4.xml.p7m     | 1606 +++++++
 .../test-documents/testPKCS17Sig.xml.p7m        | 4333 ++++++++++++++++++
 .../tika/server/resource/TikaResource.java      |   14 +-
 .../apache/tika/server/StackTraceOffTest.java   |    2 -
 .../org/apache/tika/server/StackTraceTest.java  |    2 -
 .../apache/tika/server/TikaResourceTest.java    |   12 +
 .../services/org.apache.tika.parser.Parser      |   16 -
 .../testRTF_npeFromWMFInTikaServer.rtf          |  235 +
 24 files changed, 6713 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/f054bcd1/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/f054bcd1/tika-parsers/pom.xml
----------------------------------------------------------------------


[20/20] tika git commit: Fix merge conflict.

Posted by ma...@apache.org.
Fix merge conflict.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9056894d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9056894d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9056894d

Branch: refs/heads/master
Commit: 9056894da580107d1a5a21b29a0b7042ffa15c42
Parents: 3fbc03c 7c245fa
Author: Chris Mattmann <ma...@apache.org>
Authored: Tue Mar 1 21:41:57 2016 -0800
Committer: Chris Mattmann <ma...@apache.org>
Committed: Tue Mar 1 21:41:57 2016 -0800

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  20 ++
 .../org/apache/tika/parser/pdf/PDFParser.java   |  35 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java |  36 ++-
 .../apache/tika/parser/pdf/XFAExtractor.java    | 318 +++++++++++++++++++
 .../apache/tika/parser/pdf/PDFParser.properties |   3 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  32 +-
 .../testPDF_XFA_govdocs1_258578.pdf             | Bin 0 -> 168176 bytes
 8 files changed, 442 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/9056894d/CHANGES.txt
----------------------------------------------------------------------
diff --cc CHANGES.txt
index d5bebcd,05d6d76..e6603fa
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@@ -1,8 -1,7 +1,10 @@@
  Release 1.13 - ???
  
 +  * Tika now incorporates the Natural Language Toolkit (NLTK) from the
 +    Python community as an option for Named Entity Recognition (TIKA-1876).
 +
+   * Add support for XFA extraction via Pascal Essiembre (TIKA-1857).
+ 
    * Upgrade to sqlite-jdbc 3.8.11.2 (TIKA-1861).  NOTE: this dependency
      is still <scope>provided</scope>.  You need to include this dependency
      in order to parser sqlite files.


[11/20] tika git commit: fix for TIKA-1876 contributed by manalishah

Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a13369b0
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a13369b0
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a13369b0

Branch: refs/heads/master
Commit: a13369b098bea09421e35023c131adc092dcb6e4
Parents: 6c595fb
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 18:21:15 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 18:21:15 2016 -0800

----------------------------------------------------------------------
 .../org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java   | 9 ---------
 .../org/apache/tika/parser/ner/nltk/NLTKServer.properties   | 2 +-
 .../apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java   | 2 +-
 3 files changed, 2 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/a13369b0/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index eddddcb..1edfe28 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -45,15 +45,6 @@ public class NLTKNERecogniser implements NERecogniser {
      * some common entities identified by NLTK
      */
     public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
-        add(PERSON);
-        add(TIME);
-        add(LOCATION);
-        add(ORGANIZATION);
-        add(MONEY);
-        add(PERCENT);
-        add(DATE);
-        add(FACILITY);
-        add(GPE);
         add("NAMES");
     }};
 

http://git-wip-us.apache.org/repos/asf/tika/blob/a13369b0/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
index 24f5a2e..5909b69 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -13,4 +13,4 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-nltk.server.url=http://localhost:5000
+nltk.server.url=http://localhost:8881

http://git-wip-us.apache.org/repos/asf/tika/blob/a13369b0/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index a40ec24..5c1307f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -36,7 +36,7 @@ import static org.junit.Assert.assertTrue;
 public class NLTKNERecogniserTest {
     @Test
     public void testGetEntityTypes() throws Exception {
-        String text = "America";
+        String text = "America is a big country.";
         System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
         Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
         Metadata md = new Metadata();


[06/20] tika git commit: Used Apache CXF WebClient

Posted by ma...@apache.org.
Used Apache CXF WebClient


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/14ca3204
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/14ca3204
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/14ca3204

Branch: refs/heads/master
Commit: 14ca32045361918f5dd28c63c9692accbcfa31d5
Parents: 892beca
Author: manali <ma...@gmail.com>
Authored: Sat Feb 6 17:00:38 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Sat Feb 6 17:00:38 2016 -0800

----------------------------------------------------------------------
 .../apache/tika/parser/ner/NERecogniser.java    |  2 ++
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 34 +++++++-------------
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  7 ++--
 3 files changed, 18 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/14ca3204/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
index c4693eb..3bebff2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
@@ -36,6 +36,8 @@ public interface NERecogniser {
     String DATE = "DATE";
     String PERCENT = "PERCENT";
     String MONEY = "MONEY";
+    String FACILITY = "FACILITY";
+    String GPE = "GPE";
 
     /**
      * checks if this Named Entity recogniser is available for service

http://git-wip-us.apache.org/repos/asf/tika/blob/14ca3204/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index cb152f3..99cde6f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -41,6 +41,13 @@ import org.apache.http.client.methods.HttpPost;
 import org.apache.http.impl.client.HttpClientBuilder;
 import org.apache.http.message.BasicNameValuePair;
 
+import javax.ws.rs.core.Form;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 
 /**
  *  This class offers an implementation of {@link NERecogniser} based on
@@ -73,13 +80,8 @@ public class NLTKNERecogniser implements NERecogniser {
     public NLTKNERecogniser(){
         try {
             String url = "http://localhost:5000/";
-            HttpClient client = HttpClientBuilder.create().build();
-            HttpGet get = new HttpGet(url);
-
-            // add header
-            get.setHeader("User-Agent", USER_AGENT);
-            HttpResponse response = client.execute(get);
-            int responseCode = response.getStatusLine().getStatusCode();
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get();
+            int responseCode = response.getStatus();
             if(responseCode == 200){
                 available = true;
             }
@@ -118,22 +120,10 @@ public class NLTKNERecogniser implements NERecogniser {
         Map<String, Set<String>> entities = new HashMap<>();
         try {
             String url = "http://localhost:5000/nltk";
-            HttpClient client = HttpClientBuilder.create().build();
-            HttpPost post = new HttpPost(url);
-            post.setHeader("User-Agent", USER_AGENT);
-            List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
-            urlParameters.add(new BasicNameValuePair("text", text));
-            post.setEntity(new UrlEncodedFormEntity(urlParameters));
-
-            HttpResponse response = client.execute(post);
-
-            int responseCode = response.getStatusLine().getStatusCode();
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text));
+            int responseCode = response.getStatus();
             if (responseCode == 200) {
-                BufferedReader rd = new BufferedReader(
-                        new InputStreamReader(response.getEntity().getContent()));
-
-                String result = rd.readLine();
-
+                String result = response.readEntity(String.class);
                 JSONParser parser = new JSONParser();
                 JSONObject j = (JSONObject) parser.parse(result);
                 JSONArray aa = new JSONArray();

http://git-wip-us.apache.org/repos/asf/tika/blob/14ca3204/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index ac04066..563e836 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -21,6 +21,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.io.ByteArrayInputStream;
@@ -34,7 +35,6 @@ import static org.junit.Assert.assertTrue;
 public class NLTKNERecogniserTest {
     @Test
     public void testGetEntityTypes() throws Exception {
-
         String text = "America";
         System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
 
@@ -42,9 +42,10 @@ public class NLTKNERecogniserTest {
         Metadata md = new Metadata();
         tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
 
-
         Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
-        if(gpe.size() == 0) return;
+        if(gpe.size() == 0) {
+            return;
+        }
         else {
             assertTrue(gpe.contains("America"));
             assertTrue(gpe.size() == 1); //and nothing else


[18/20] tika git commit: resolved conflicts

Posted by ma...@apache.org.
resolved conflicts


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e147de34
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e147de34
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e147de34

Branch: refs/heads/master
Commit: e147de3429b775f5036c78b18a1cb688971be0af
Parents: 0dbd69c cdb684d
Author: manali <ma...@gmail.com>
Authored: Tue Mar 1 01:05:34 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Mar 1 01:05:34 2016 -0800

----------------------------------------------------------------------
 .../apache/tika/sax/RichTextContentHandler.java | 58 ++++++++++++++++++++
 .../tika/server/RichTextContentHandler.java     | 58 --------------------
 2 files changed, 58 insertions(+), 58 deletions(-)
----------------------------------------------------------------------



[17/20] tika git commit: updated with changes

Posted by ma...@apache.org.
updated with changes


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0dbd69ce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0dbd69ce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0dbd69ce

Branch: refs/heads/master
Commit: 0dbd69cef5ec603a11d7f5b52f119e3bea1550b5
Parents: 7ebe007
Author: manali <ma...@gmail.com>
Authored: Tue Mar 1 00:59:11 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Mar 1 00:59:11 2016 -0800

----------------------------------------------------------------------
 .../tika/sax/RichTextContentHandlerTest.java    | 75 ++++++++++++++++++++
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 16 +++--
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  2 +-
 3 files changed, 86 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
new file mode 100644
index 0000000..257ea38
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Test cases for the {@link RichTextContentHandler} class.
+ */
+public class RichTextContentHandlerTest {
+
+    /**
+     * Test to check that a tags are detected and the rich text version is used.
+     */
+    @Test
+    public void aTagTest() throws Exception {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(
+                new RichTextContentHandler(
+                    new OutputStreamWriter(buffer, Charset.defaultCharset())),
+                new Metadata());
+        xhtml.startDocument();
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "", "name", "", "value");
+        xhtml.startElement("a", attributes);
+        xhtml.endDocument();
+
+        assertEquals("\n\n\n\n[bookmark: value]", buffer.toString(UTF_8.name()));
+    }
+
+    /**
+     * Test to check that img tags are detected and the rich text version is used.
+     */
+    @Test
+    public void imgTagTest() throws Exception {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(
+                new RichTextContentHandler(
+                    new OutputStreamWriter(buffer, Charset.defaultCharset())),
+                new Metadata());
+        xhtml.startDocument();
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "", "alt", "", "value");
+        xhtml.startElement("img", attributes);
+        xhtml.endDocument();
+
+        assertEquals("\n\n\n\n[image: value]", buffer.toString(UTF_8.name()));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 1edfe28..5407189 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -22,8 +22,13 @@ import org.json.simple.parser.JSONParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.*;
-import java.util.*;
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collection;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Properties;
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 
@@ -41,6 +46,7 @@ public class NLTKNERecogniser implements NERecogniser {
     private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
     private static boolean available = false;
     private static final String NLTK_REST_HOST = "http://localhost:8881";
+    private String restHostUrlStr;
      /**
      * some common entities identified by NLTK
      */
@@ -48,7 +54,7 @@ public class NLTKNERecogniser implements NERecogniser {
         add("NAMES");
     }};
 
-    String restHostUrlStr;
+
     public NLTKNERecogniser(){
         try {
 
@@ -59,8 +65,7 @@ public class NLTKNERecogniser implements NERecogniser {
                 e.printStackTrace();
             }
 
-            if (restHostUrlStr == null
-                    || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+            if (restHostUrlStr == null || restHostUrlStr.equals("")) {
                 this.restHostUrlStr = NLTK_REST_HOST;
             } else {
                 this.restHostUrlStr = restHostUrlStr;
@@ -115,7 +120,6 @@ public class NLTKNERecogniser implements NERecogniser {
     public Map<String, Set<String>> recognise(String text) {
         Map<String, Set<String>> entities = new HashMap<>();
         try {
-            int port = 8881;
             String url = restHostUrlStr + "/nltk";
             Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
             int responseCode = response.getStatus();

http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 5c1307f..94d9a27 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -48,7 +48,7 @@ public class NLTKNERecogniserTest {
         }
         else {
             assertTrue(names.contains("America"));
-            assertTrue(names.size() == 1); //and nothing else
+            assertTrue(names.size() == 1); 
         }
     }
 }


[16/20] tika git commit: fix for TIKA-1876 contributed by manalishah

Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cdb684d9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cdb684d9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cdb684d9

Branch: refs/heads/master
Commit: cdb684d9c1b0ebb01a783180f07417760fa04d6f
Parents: 114d0ff
Author: manali <ma...@gmail.com>
Authored: Sat Feb 27 02:10:06 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Sat Feb 27 02:10:06 2016 -0800

----------------------------------------------------------------------
 .../java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/cdb684d9/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 850f4dd..3e881fe 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -68,7 +68,7 @@ public class NLTKNERecogniser implements NERecogniser {
                 this.restHostUrlStr = restHostUrlStr;
             }
             //check if nltkrest is running 
-            Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            Response response = WebClient.create(restHostUrlStr).get();
             int responseCode = response.getStatus();
             if(responseCode == 200){
                 available = true;
@@ -116,7 +116,7 @@ public class NLTKNERecogniser implements NERecogniser {
         try {
             int port = 8881;
             String url = restHostUrlStr + "/nltk";
-            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+            Response response = WebClient.create(url).post(text);
             int responseCode = response.getStatus();
             if (responseCode == 200) {
                 String result = response.readEntity(String.class);


[09/20] tika git commit: created NLTK host server properties

Posted by ma...@apache.org.
created NLTK host server properties


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ac4c0b2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ac4c0b2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ac4c0b2c

Branch: refs/heads/master
Commit: ac4c0b2c9321bba395b92214a3504d8346c3e936
Parents: f054bcd
Author: manali <ma...@gmail.com>
Authored: Wed Feb 24 22:23:26 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Wed Feb 24 22:23:26 2016 -0800

----------------------------------------------------------------------
 tika-parsers/pom.xml                            | 15 ++--
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 72 ++++++++++----------
 .../tika/parser/ner/nltk/NLTKServer.properties  | 16 +++++
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  8 +--
 4 files changed, 63 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 41daf4d..088a6e9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -88,13 +88,6 @@
       <version>2.1.1</version>
     </dependency>
 
-
-    <!-- manali added this-->
-    <dependency>
-      <groupId>org.apache.httpcomponents</groupId>
-      <artifactId>httpclient</artifactId>
-      <version>4.5.1</version>
-    </dependency>
     <!-- Optional OSGi dependencies, used only when running within OSGi -->
     <dependency>
       <groupId>org.apache.felix</groupId>
@@ -366,6 +359,14 @@
       <version>3.2.2</version>
       <scope>provided</scope>
     </dependency>
+
+    <!--Jackson parse String to JSON-->
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+      <version>2.7.1</version>
+    </dependency>
+
   </dependencies>
 
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 99cde6f..eddddcb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -16,38 +16,18 @@
  */
 package org.apache.tika.parser.ner.nltk;
 
-import org.apache.http.client.methods.HttpGet;
 import org.apache.tika.parser.ner.NERecogniser;
-import org.json.simple.JSONArray;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.message.BasicNameValuePair;
-
-import javax.ws.rs.core.Form;
+import java.io.*;
+import java.util.*;
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 
 import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
-import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 
 /**
  *  This class offers an implementation of {@link NERecogniser} based on
@@ -59,9 +39,8 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 public class NLTKNERecogniser implements NERecogniser {
 
     private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
-    private final static String USER_AGENT = "Mozilla/5.0";
     private static boolean available = false;
-    
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
      /**
      * some common entities identified by NLTK
      */
@@ -75,12 +54,31 @@ public class NLTKNERecogniser implements NERecogniser {
         add(DATE);
         add(FACILITY);
         add(GPE);
+        add("NAMES");
     }};
 
+    String restHostUrlStr;
     public NLTKNERecogniser(){
         try {
-            String url = "http://localhost:5000/";
-            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get();
+
+            String restHostUrlStr="";
+            try {
+                restHostUrlStr = readRestUrl();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+
+            if (restHostUrlStr == null
+                    || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+                this.restHostUrlStr = NLTK_REST_HOST;
+            } else {
+                this.restHostUrlStr = restHostUrlStr;
+            }
+
+
+
+
+            Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
             int responseCode = response.getStatus();
             if(responseCode == 200){
                 available = true;
@@ -94,6 +92,13 @@ public class NLTKNERecogniser implements NERecogniser {
         }
     }
 
+    private static String readRestUrl() throws IOException {
+        Properties nltkProperties = new Properties();
+        nltkProperties.load(NLTKNERecogniser.class
+                .getResourceAsStream("NLTKServer.properties"));
+
+        return nltkProperties.getProperty("nltk.server.url");
+    }
 
     /**
      * @return {@code true} if server endpoint is available.
@@ -119,22 +124,15 @@ public class NLTKNERecogniser implements NERecogniser {
     public Map<String, Set<String>> recognise(String text) {
         Map<String, Set<String>> entities = new HashMap<>();
         try {
-            String url = "http://localhost:5000/nltk";
-            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text));
+            int port = 8881;
+            String url = restHostUrlStr + "/nltk";
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
             int responseCode = response.getStatus();
             if (responseCode == 200) {
                 String result = response.readEntity(String.class);
                 JSONParser parser = new JSONParser();
                 JSONObject j = (JSONObject) parser.parse(result);
-                JSONArray aa = new JSONArray();
-                for (Object x : j.keySet()) {
-                    aa = (JSONArray) j.get(x.toString());
-                    Set s = new HashSet();
-                    for (Object y : aa) {
-                        s.add(y.toString());
-                    }
-                    entities.put(x.toString(), s);
-                }
+                Set s = entities.put("NAMES", new HashSet((Collection) j.get("names")));
             }
         }
         catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..24f5a2e
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+nltk.server.url=http://localhost:5000

http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 2861051..a40ec24 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -42,13 +42,13 @@ public class NLTKNERecogniserTest {
         Metadata md = new Metadata();
         tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
 
-        Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
-        if(gpe.size() == 0) {
+        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+        if(names.size() == 0) {
             return;
         }
         else {
-            assertTrue(gpe.contains("America"));
-            assertTrue(gpe.size() == 1); //and nothing else
+            assertTrue(names.contains("America"));
+            assertTrue(names.size() == 1); //and nothing else
         }
     }
 }


[14/20] tika git commit: fix for TIKA-1876 contributed by manalishah

Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a7e24c9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a7e24c9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a7e24c9

Branch: refs/heads/master
Commit: 3a7e24c9a5d77ae41bde0c2106233a2064c5e707
Parents: c809690
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 20:00:05 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 20:00:05 2016 -0800

----------------------------------------------------------------------
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3a7e24c9/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 8093709..40c895f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ target
 *.iws
 *.bin
 nbactions.xml
-nb-configuration.xml*.DS_Store
+nb-configuration.xml
+*.DS_Store


[12/20] tika git commit: fix for TIKA-1876 contributed by manalishah

Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7ebe007e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7ebe007e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7ebe007e

Branch: refs/heads/master
Commit: 7ebe007ec03088449f67619ef1e6cb564178b14b
Parents: a13369b
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 18:36:02 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 18:36:02 2016 -0800

----------------------------------------------------------------------
 CHANGES.txt                                          |  2 ++
 .../src/main/java/org/apache/tika/mime/MimeType.java |  1 -
 .../org/apache/tika/mime/tika-mimetypes.xml          | 13 ++++++-------
 .../tika/parser/microsoft/ooxml/XWPFListManager.java |  4 ++++
 .../org/apache/tika/parser/ner/NERecogniser.java     |  2 --
 .../apache/tika/server/RichTextContentHandler.java   | 15 +++++++++++++--
 .../apache/tika/server/resource/TikaResource.java    |  2 +-
 .../tika/server/resource/UnpackerResource.java       |  2 +-
 8 files changed, 27 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bb30540..0ffc69f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -9,6 +9,8 @@ Release 1.13 - ???
 
   * Upgrade to Jackson 2.7.1 (TIKA-1869).
 
+  * RichTextContentHandler moved from the Server package to Core (TIKA-1870).
+
 Release 1.12 - 01/24/2016
 
   * Support for iFrames and element link extraction is provided in

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index fc520cf..b4d651e 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -270,7 +270,6 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
         }
     }
 
-
     void addMagic(Magic magic) {
         if (magic == null) {
             return;

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5bb30fc..95f41e6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -38,12 +38,6 @@
 -->
 <mime-info>
 
-  <mime-type type="application/dicom">
-    <_comment>DICOM medical imaging data</_comment>
-    <magic priority="50">
-      <match value="DICM" type="string" offset="128"/>
-    </magic>
-  </mime-type>
   <mime-type type="application/activemessage"/>
   <mime-type type="application/andrew-inset">
     <glob pattern="*.ez"/>
@@ -118,7 +112,12 @@
   <mime-type type="application/dec-dx"/>
   <mime-type type="application/dialog-info+xml"/>
 
-
+  <mime-type type="application/dicom">
+    <_comment>DICOM medical imaging data</_comment>
+    <magic priority="50">
+      <match value="DICM" type="string" offset="128"/>
+    </magic>
+  </mime-type>
 
   <mime-type type="application/dita+xml">
     <sub-class-of type="application/xml"/>

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index 5654378..a938c2f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -57,6 +57,10 @@ public class XWPFListManager extends AbstractListManager {
      * @return the formatted number or an empty string if something went wrong
      */
     public String getFormattedNumber(final XWPFParagraph paragraph) {
+        if (numbering == null) {
+            return "";
+        }
+
         int currNumId = paragraph.getNumID().intValue();
         XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
         if (xwpfNum == null) {

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
index 3bebff2..c4693eb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
@@ -36,8 +36,6 @@ public interface NERecogniser {
     String DATE = "DATE";
     String PERCENT = "PERCENT";
     String MONEY = "MONEY";
-    String FACILITY = "FACILITY";
-    String GPE = "GPE";
 
     /**
      * checks if this Named Entity recogniser is available for service

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java b/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
index 81095a7..8fcc4d5 100644
--- a/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
+++ b/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
@@ -15,15 +15,26 @@
  * limitations under the License.
  */
 
-package org.apache.tika.server;
+package org.apache.tika.sax;
 
 import java.io.Writer;
 
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
+/**
+ * Content handler for rich text: it writes the <code>alt</code> attribute
+ * of XHTML &lt;img/&gt; tags and the <code>name</code> attribute of XHTML
+ * &lt;a/&gt; tags into the output.
+ */
 public class RichTextContentHandler extends WriteOutContentHandler {
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
     public RichTextContentHandler(Writer writer) {
         super(writer);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index d74ef74..566203a 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -72,7 +72,7 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
-import org.apache.tika.server.RichTextContentHandler;
+import org.apache.tika.sax.RichTextContentHandler;
 import org.apache.tika.server.TikaServerParseException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;

http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index cf3a0e9..8ee516e 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -58,7 +58,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.server.RichTextContentHandler;
+import org.apache.tika.sax.RichTextContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;


[07/20] tika git commit: nltk modification

Posted by ma...@apache.org.
nltk modification


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1b14b39d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1b14b39d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1b14b39d

Branch: refs/heads/master
Commit: 1b14b39d3e1b453620b2f7b26a933103a78c958a
Parents: 14ca320
Author: manali <ma...@gmail.com>
Authored: Fri Feb 19 17:37:25 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 19 17:37:25 2016 -0800

----------------------------------------------------------------------
 .../src/main/java/org/apache/tika/mime/MimeType.java   |  1 +
 .../resources/org/apache/tika/mime/tika-mimetypes.xml  | 13 +++++++------
 .../tika/parser/ner/nltk/NLTKNERecogniserTest.java     |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/1b14b39d/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index b4d651e..fc520cf 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -270,6 +270,7 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
         }
     }
 
+
     void addMagic(Magic magic) {
         if (magic == null) {
             return;

http://git-wip-us.apache.org/repos/asf/tika/blob/1b14b39d/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 1d7b42b..52dd67b 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -38,6 +38,12 @@
 -->
 <mime-info>
 
+  <mime-type type="application/dicom">
+    <_comment>DICOM medical imaging data</_comment>
+    <magic priority="50">
+      <match value="DICM" type="string" offset="128"/>
+    </magic>
+  </mime-type>
   <mime-type type="application/activemessage"/>
   <mime-type type="application/andrew-inset">
     <glob pattern="*.ez"/>
@@ -112,12 +118,7 @@
   <mime-type type="application/dec-dx"/>
   <mime-type type="application/dialog-info+xml"/>
 
-  <mime-type type="application/dicom">
-    <_comment>DICOM medical imaging data</_comment>
-    <magic priority="50">
-      <match value="DICM" type="string" offset="128"/>
-    </magic>
-  </mime-type>
+
 
   <mime-type type="application/dita+xml">
     <sub-class-of type="application/xml"/>

http://git-wip-us.apache.org/repos/asf/tika/blob/1b14b39d/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 563e836..2861051 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -25,6 +25,7 @@ import org.junit.Ignore;
 import org.junit.Test;
 
 import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.HashSet;
@@ -37,7 +38,6 @@ public class NLTKNERecogniserTest {
     public void testGetEntityTypes() throws Exception {
         String text = "America";
         System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
-
         Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
         Metadata md = new Metadata();
         tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);


[10/20] tika git commit: Merge remote-tracking branch 'upstream/master'

Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master'


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c595fbd
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c595fbd
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c595fbd

Branch: refs/heads/master
Commit: 6c595fbd3ecefe598cc96b34e2e0baa1f064f6df
Parents: ac4c0b2 08e38bb
Author: manali <ma...@gmail.com>
Authored: Wed Feb 24 22:25:07 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Wed Feb 24 22:25:07 2016 -0800

----------------------------------------------------------------------
 CHANGES.txt                                     |  2 ++
 tika-translate/pom.xml                          |  2 +-
 .../language/translate/CachedTranslator.java    | 25 ++++++++++++--------
 3 files changed, 18 insertions(+), 11 deletions(-)
----------------------------------------------------------------------



[03/20] tika git commit: Update NLTKNERecogniser.java

Posted by ma...@apache.org.
Update NLTKNERecogniser.java

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/db2b4757
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/db2b4757
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/db2b4757

Branch: refs/heads/master
Commit: db2b475733dffca63143551a5f1ddd89d97f0960
Parents: 2b99eea
Author: Manali Shah <ma...@usc.edu>
Authored: Tue Feb 2 00:33:17 2016 -0800
Committer: Manali Shah <ma...@usc.edu>
Committed: Tue Feb 2 00:33:17 2016 -0800

----------------------------------------------------------------------
 .../tika/parser/ner/nltk/NLTKNERecogniser.java    | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/db2b4757/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index eb216ea..cb152f3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -44,11 +44,9 @@ import org.apache.http.message.BasicNameValuePair;
 
 /**
  *  This class offers an implementation of {@link NERecogniser} based on
- *  CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
- *  due to runtime binding to Stanford CoreNLP.
+ *  ne_chunk() module of NLTK. This NER requires additional setup,
+ *  due to Http requests to an endpoint server that runs NLTK.
  *  See <a href="http://wiki.apache.org/tika/TikaAndNER#NLTK">
- *      Tika NER Wiki</a> for configuring this recogniser.
- *  @see NERecogniser
  *
  */
 public class NLTKNERecogniser implements NERecogniser {
@@ -56,6 +54,10 @@ public class NLTKNERecogniser implements NERecogniser {
     private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
     private final static String USER_AGENT = "Mozilla/5.0";
     private static boolean available = false;
+    
+     /**
+     * some common entities identified by NLTK
+     */
     public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
         add(PERSON);
         add(TIME);
@@ -70,7 +72,6 @@ public class NLTKNERecogniser implements NERecogniser {
 
     public NLTKNERecogniser(){
         try {
-
             String url = "http://localhost:5000/";
             HttpClient client = HttpClientBuilder.create().build();
             HttpGet get = new HttpGet(url);
@@ -93,9 +94,8 @@ public class NLTKNERecogniser implements NERecogniser {
 
 
     /**
-     *
-     * @return {@code true} if model was available, valid and was able to initialise the classifier.
-     * returns {@code false} when this recogniser is not available for service.
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not available for service.
      */
     public boolean isAvailable() {
         return available;
@@ -120,7 +120,6 @@ public class NLTKNERecogniser implements NERecogniser {
             String url = "http://localhost:5000/nltk";
             HttpClient client = HttpClientBuilder.create().build();
             HttpPost post = new HttpPost(url);
-            // add header
             post.setHeader("User-Agent", USER_AGENT);
             List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
             urlParameters.add(new BasicNameValuePair("text", text));
@@ -153,7 +152,6 @@ public class NLTKNERecogniser implements NERecogniser {
         }
         ENTITY_TYPES.clear();
         ENTITY_TYPES.addAll(entities.keySet());
-        LOG.info("returning this:" + entities.keySet().toString());
         return entities;
     }
 


[02/20] tika git commit: Merge remote-tracking branch 'upstream/master' Integrated NLTK into Tika Parsers by using endpoint as NLTKRest

Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master'
Integrated NLTK into Tika Parsers by using endpoint as NLTKRest


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2b99eeab
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2b99eeab
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2b99eeab

Branch: refs/heads/master
Commit: 2b99eeab8bb4c8aaaf13202904910298dc446c05
Parents: d685742 256209a
Author: manali <ma...@gmail.com>
Authored: Tue Feb 2 00:18:16 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Feb 2 00:18:16 2016 -0800

----------------------------------------------------------------------
 CHANGES.txt                                                  | 4 ++++
 tika-parsers/pom.xml                                         | 2 +-
 .../test/java/org/apache/tika/parser/pdf/PDFParserTest.java  | 8 ++------
 3 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/2b99eeab/tika-parsers/pom.xml
----------------------------------------------------------------------


[15/20] tika git commit: fix for TIKA-1876 contributed by manalishah

Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/114d0ff2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/114d0ff2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/114d0ff2

Branch: refs/heads/master
Commit: 114d0ff24bd04395852012a3382d50c3e906e6db
Parents: 3a7e24c
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 20:06:20 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 20:06:20 2016 -0800

----------------------------------------------------------------------
 tika-parsers/pom.xml | 8 ++++++++
 1 file changed, 8 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/114d0ff2/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 885bbc2..4f92b1b 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -359,6 +359,14 @@
       <version>3.2.2</version>
       <scope>provided</scope>
     </dependency>
+    
+    <!--Jackson parse String to JSON-->
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+      <version>2.7.1</version>
+    </dependency>
+
   </dependencies>
 
   <build>


[04/20] tika git commit: Update NLTKNERecogniserTest.java

Posted by ma...@apache.org.
Update NLTKNERecogniserTest.java

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/59ddcaac
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/59ddcaac
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/59ddcaac

Branch: refs/heads/master
Commit: 59ddcaac5f479b4485d35c061e81dfa612773aab
Parents: db2b475
Author: Manali Shah <ma...@usc.edu>
Authored: Tue Feb 2 00:35:41 2016 -0800
Committer: Manali Shah <ma...@usc.edu>
Committed: Tue Feb 2 00:35:41 2016 -0800

----------------------------------------------------------------------
 .../org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java     | 3 ---
 1 file changed, 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/59ddcaac/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 4fbeb42..23a174c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -1,8 +1,5 @@
 package org.apache.tika.parser.ner.nltk;
 
-/**
- * Created by manali on 2/1/16.
- */
 import org.apache.commons.logging.Log;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;


[13/20] tika git commit: fix for TIKA-1876 contributed by manalishah

Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c809690e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c809690e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c809690e

Branch: refs/heads/master
Commit: c809690ec87ffa600018dbc5eee6d6756645adb0
Parents: ed762b7
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 19:58:06 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 19:58:06 2016 -0800

----------------------------------------------------------------------
 .gitignore                                      |   2 +-
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 137 +++++++++++++++++++
 .../tika/parser/ner/nltk/NLTKServer.properties  |  16 +++
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  54 ++++++++
 4 files changed, 208 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c262c68..8093709 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,4 @@ target
 *.iws
 *.bin
 nbactions.xml
-nb-configuration.xml
\ No newline at end of file
+nb-configuration.xml*.DS_Store

http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..850f4dd
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.util.*;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  ne_chunk() module of NLTK. This NER requires additional setup,
+ *  due to Http requests to an endpoint server that runs NLTK.
+ *  This endpoint has been implemented as pip/setuptools installable python module
+ *  See <a href="https://github.com/manalishah/NLTKRest">NLTKRest</a>
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNLTK">TikaAndNLTK wiki page</a>
+ *
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+    private static boolean available = false;
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
+     /**
+     * some common entities identified by NLTK
+     */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add("NAMES");
+    }};
+
+    String restHostUrlStr;
+    public NLTKNERecogniser(){
+        try {
+
+            String restHostUrlStr="";
+            try {
+                restHostUrlStr = readRestUrl();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+
+            if (restHostUrlStr == null
+                    || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+                this.restHostUrlStr = NLTK_REST_HOST;
+            } else {
+                this.restHostUrlStr = restHostUrlStr;
+            }
+            //check if nltkrest is running 
+            Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            int responseCode = response.getStatus();
+            if(responseCode == 200){
+                available = true;
+            }
+            else{
+                LOG.info("NLTKRest Server is not running");
+            }
+
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+    }
+
+    private static String readRestUrl() throws IOException {
+        Properties nltkProperties = new Properties();
+        nltkProperties.load(NLTKNERecogniser.class
+                .getResourceAsStream("NLTKServer.properties"));
+
+        return nltkProperties.getProperty("nltk.server.url");
+    }
+
+    /**
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        try {
+            int port = 8881;
+            String url = restHostUrlStr + "/nltk";
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+            int responseCode = response.getStatus();
+            if (responseCode == 200) {
+                String result = response.readEntity(String.class);
+                JSONParser parser = new JSONParser();
+                JSONObject j = (JSONObject) parser.parse(result);
+                Set s = entities.put("NAMES", new HashSet((Collection) j.get("names")));
+            }
+        }
+        catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+        ENTITY_TYPES.clear();
+        ENTITY_TYPES.addAll(entities.keySet());
+        return entities;
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..5909b69
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+nltk.server.url=http://localhost:8881

http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..5c1307f
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.commons.logging.Log;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        String text = "America is a big country.";
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+        Metadata md = new Metadata();
+        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+        if(names.size() == 0) {
+            return;
+        }
+        else {
+            assertTrue(names.contains("America"));
+            assertTrue(names.size() == 1); //and nothing else
+        }
+    }
+}


[19/20] tika git commit: Fix for TIKA-1876 Integrate Natural Language Toolkit (NLTK) into Tika to perform Named Entity Recognition contributed by Manali Shah this closes #80

Posted by ma...@apache.org.
Fix for TIKA-1876 Integrate Natural Language Toolkit (NLTK) into Tika to perform Named Entity Recognition contributed by Manali Shah <ma...@gmail.com> this closes #80


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3fbc03ce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3fbc03ce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3fbc03ce

Branch: refs/heads/master
Commit: 3fbc03cead1c54bd023a19e52e31609b51926d7d
Parents: e147de3
Author: Chris Mattmann <ma...@apache.org>
Authored: Tue Mar 1 21:41:07 2016 -0800
Committer: Chris Mattmann <ma...@apache.org>
Committed: Tue Mar 1 21:41:07 2016 -0800

----------------------------------------------------------------------
 CHANGES.txt | 4 ++++
 1 file changed, 4 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3fbc03ce/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 0ffc69f..d5bebcd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,8 @@
 Release 1.13 - ???
+
+  * Tika now incorporates the Natural Language Toolkit (NLTK) from the
+    Python community as an option for Named Entity Recognition (TIKA-1876).
+
   * Upgrade to sqlite-jdbc 3.8.11.2 (TIKA-1861).  NOTE: this dependency
     is still <scope>provided</scope>.  You need to include this dependency
     in order to parse sqlite files.