You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/03/02 06:42:10 UTC
[01/20] tika git commit: Added NLTK NER
Repository: tika
Updated Branches:
refs/heads/master 7c245fa87 -> 9056894da
Added NLTK NER
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d685742c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d685742c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d685742c
Branch: refs/heads/master
Commit: d685742c6e81b9153ce881f9622292104a4144d2
Parents: 6a09233
Author: manali <ma...@gmail.com>
Authored: Tue Feb 2 00:12:28 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Feb 2 00:12:28 2016 -0800
----------------------------------------------------------------------
.gitignore | 3 +-
tika-parsers/pom.xml | 7 +
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 161 +++++++++++++++++++
.../parser/ner/nltk/NLTKNERecogniserTest.java | 40 +++++
4 files changed, 210 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c262c68..40c895f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ target
*.iws
*.bin
nbactions.xml
-nb-configuration.xml
\ No newline at end of file
+nb-configuration.xml
+*.DS_Store
http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 57497ec..8d330c3 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -88,6 +88,13 @@
<version>2.1.1</version>
</dependency>
+
+ <!-- HTTP client used by the NLTK NER recogniser to call the NLTK REST server -->
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>4.5.1</version>
+ </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
<groupId>org.apache.felix</groupId>
http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..eb216ea
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.http.client.methods.HttpGet;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import org.apache.http.HttpResponse;
+import org.apache.http.NameValuePair;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.message.BasicNameValuePair;
+
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * the Natural Language Toolkit (NLTK). This NER requires additional setup,
+ * because it calls an NLTK REST server over HTTP at runtime.
+ * See <a href="http://wiki.apache.org/tika/TikaAndNER#NLTK">
+ * Tika NER Wiki</a> for configuring this recogniser.
+ * @see NERecogniser
+ *
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+ private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+ private final static String USER_AGENT = "Mozilla/5.0";
+ private static boolean available = false;
+ public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+ add(PERSON);
+ add(TIME);
+ add(LOCATION);
+ add(ORGANIZATION);
+ add(MONEY);
+ add(PERCENT);
+ add(DATE);
+ add(FACILITY);
+ add(GPE);
+ }};
+
+ public NLTKNERecogniser(){
+ try {
+
+ String url = "http://localhost:5000/";
+ HttpClient client = HttpClientBuilder.create().build();
+ HttpGet get = new HttpGet(url);
+
+ // add header
+ get.setHeader("User-Agent", USER_AGENT);
+ HttpResponse response = client.execute(get);
+ int responseCode = response.getStatusLine().getStatusCode();
+ if(responseCode == 200){
+ available = true;
+ }
+ else{
+ LOG.info("NLTKRest Server is not running");
+ }
+
+ } catch (Exception e) {
+ LOG.debug(e.getMessage(), e);
+ }
+ }
+
+
+ /**
+ *
+ * @return {@code true} if the NLTK REST server answered the availability probe with HTTP 200;
+ * returns {@code false} when this recogniser is not available for service.
+ */
+ public boolean isAvailable() {
+ return available;
+ }
+
+ /**
+ * Gets set of entity types recognised by this recogniser
+ * @return set of entity classes/types
+ */
+ public Set<String> getEntityTypes() {
+ return ENTITY_TYPES;
+ }
+
+ /**
+ * recognises names of entities in the text
+ * @param text text which possibly contains names
+ * @return map of entity type -> set of names
+ */
+ public Map<String, Set<String>> recognise(String text) {
+ Map<String, Set<String>> entities = new HashMap<>();
+ try {
+ String url = "http://localhost:5000/nltk";
+ HttpClient client = HttpClientBuilder.create().build();
+ HttpPost post = new HttpPost(url);
+ // add header
+ post.setHeader("User-Agent", USER_AGENT);
+ List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
+ urlParameters.add(new BasicNameValuePair("text", text));
+ post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+ HttpResponse response = client.execute(post);
+
+ int responseCode = response.getStatusLine().getStatusCode();
+ if (responseCode == 200) {
+ BufferedReader rd = new BufferedReader(
+ new InputStreamReader(response.getEntity().getContent()));
+
+ String result = rd.readLine();
+
+ JSONParser parser = new JSONParser();
+ JSONObject j = (JSONObject) parser.parse(result);
+ JSONArray aa = new JSONArray();
+ for (Object x : j.keySet()) {
+ aa = (JSONArray) j.get(x.toString());
+ Set s = new HashSet();
+ for (Object y : aa) {
+ s.add(y.toString());
+ }
+ entities.put(x.toString(), s);
+ }
+ }
+ }
+ catch (Exception e) {
+ LOG.debug(e.getMessage(), e);
+ }
+ ENTITY_TYPES.clear();
+ ENTITY_TYPES.addAll(entities.keySet());
+ LOG.info("returning this:" + entities.keySet().toString());
+ return entities;
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..4fbeb42
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,40 @@
+package org.apache.tika.parser.ner.nltk;
+
+/**
+ * Created by manali on 2/1/16.
+ */
+import org.apache.commons.logging.Log;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+ @Test
+ public void testGetEntityTypes() throws Exception {
+
+ String text = "America";
+ System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+
+ Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+ Metadata md = new Metadata();
+ tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+
+ Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
+ if(gpe.size() == 0) return;
+ else {
+ assertTrue(gpe.contains("America"));
+ assertTrue(gpe.size() == 1); //and nothing else
+ }
+ }
+}
[05/20] tika git commit: Update NLTKNERecogniserTest.java
Posted by ma...@apache.org.
Update NLTKNERecogniserTest.java
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/892beca8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/892beca8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/892beca8
Branch: refs/heads/master
Commit: 892beca8b7c76c7849e9a71a0e252fca606753af
Parents: 59ddcaa
Author: Manali Shah <ma...@usc.edu>
Authored: Tue Feb 2 00:36:22 2016 -0800
Committer: Manali Shah <ma...@usc.edu>
Committed: Tue Feb 2 00:36:22 2016 -0800
----------------------------------------------------------------------
.../tika/parser/ner/nltk/NLTKNERecogniserTest.java | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/892beca8/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 23a174c..ac04066 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.parser.ner.nltk;
import org.apache.commons.logging.Log;
[08/20] tika git commit: Merge remote-tracking branch
'upstream/master'
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master'
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f054bcd1
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f054bcd1
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f054bcd1
Branch: refs/heads/master
Commit: f054bcd1a0a57a730ebc53e7a2cfd4215a7526fb
Parents: 1b14b39 28b9a66
Author: manali <ma...@gmail.com>
Authored: Fri Feb 19 17:37:29 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 19 17:37:29 2016 -0800
----------------------------------------------------------------------
CHANGES.txt | 8 +
tika-bundle/pom.xml | 6 +-
.../apache/tika/metadata/TikaMetadataKeys.java | 2 +
.../org/apache/tika/mime/tika-mimetypes.xml | 17 +-
tika-parsers/pom.xml | 6 +-
.../microsoft/AbstractPOIFSExtractor.java | 16 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 9 +-
.../tika/parser/pot/PooledTimeSeriesParser.java | 7 +
.../tika/parser/rtf/RTFEmbObjHandler.java | 2 +-
.../org/apache/tika/mime/TestMimeTypes.java | 25 +
.../microsoft/POIContainerExtractionTest.java | 36 +
.../parser/microsoft/PowerPointParserTest.java | 4 +-
.../apache/tika/parser/rtf/RTFParserTest.java | 9 +-
.../resources/test-documents/testJS_HTML.js | 91 +
.../test-documents/testPKCS17Sig-v2.xml.p7m | Bin 0 -> 9682 bytes
.../test-documents/testPKCS17Sig-v3.xml.p7m | 305 ++
.../test-documents/testPKCS17Sig-v4.xml.p7m | 1606 +++++++
.../test-documents/testPKCS17Sig.xml.p7m | 4333 ++++++++++++++++++
.../tika/server/resource/TikaResource.java | 14 +-
.../apache/tika/server/StackTraceOffTest.java | 2 -
.../org/apache/tika/server/StackTraceTest.java | 2 -
.../apache/tika/server/TikaResourceTest.java | 12 +
.../services/org.apache.tika.parser.Parser | 16 -
.../testRTF_npeFromWMFInTikaServer.rtf | 235 +
24 files changed, 6713 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/f054bcd1/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/f054bcd1/tika-parsers/pom.xml
----------------------------------------------------------------------
[20/20] tika git commit: Fix merge conflict.
Posted by ma...@apache.org.
Fix merge conflict.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9056894d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9056894d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9056894d
Branch: refs/heads/master
Commit: 9056894da580107d1a5a21b29a0b7042ffa15c42
Parents: 3fbc03c 7c245fa
Author: Chris Mattmann <ma...@apache.org>
Authored: Tue Mar 1 21:41:57 2016 -0800
Committer: Chris Mattmann <ma...@apache.org>
Committed: Tue Mar 1 21:41:57 2016 -0800
----------------------------------------------------------------------
CHANGES.txt | 2 +
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 20 ++
.../org/apache/tika/parser/pdf/PDFParser.java | 35 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 36 ++-
.../apache/tika/parser/pdf/XFAExtractor.java | 318 +++++++++++++++++++
.../apache/tika/parser/pdf/PDFParser.properties | 3 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 32 +-
.../testPDF_XFA_govdocs1_258578.pdf | Bin 0 -> 168176 bytes
8 files changed, 442 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/9056894d/CHANGES.txt
----------------------------------------------------------------------
diff --cc CHANGES.txt
index d5bebcd,05d6d76..e6603fa
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@@ -1,8 -1,7 +1,10 @@@
Release 1.13 - ???
+ * Tika now incorporates the Natural Language Toolkit (NLTK) from the
+ Python community as an option for Named Entity Recognition (TIKA-1876).
+
+ * Add support for XFA extraction via Pascal Essiembre (TIKA-1857).
+
* Upgrade to sqlite-jdbc 3.8.11.2 (TIKA-1861). NOTE: this dependency
is still <scope>provided</scope>. You need to include this dependency
in order to parser sqlite files.
[11/20] tika git commit: fix for TIKA-1876 contributed by manalishah
Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a13369b0
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a13369b0
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a13369b0
Branch: refs/heads/master
Commit: a13369b098bea09421e35023c131adc092dcb6e4
Parents: 6c595fb
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 18:21:15 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 18:21:15 2016 -0800
----------------------------------------------------------------------
.../org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java | 9 ---------
.../org/apache/tika/parser/ner/nltk/NLTKServer.properties | 2 +-
.../apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java | 2 +-
3 files changed, 2 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/a13369b0/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index eddddcb..1edfe28 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -45,15 +45,6 @@ public class NLTKNERecogniser implements NERecogniser {
* some common entities identified by NLTK
*/
public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
- add(PERSON);
- add(TIME);
- add(LOCATION);
- add(ORGANIZATION);
- add(MONEY);
- add(PERCENT);
- add(DATE);
- add(FACILITY);
- add(GPE);
add("NAMES");
}};
http://git-wip-us.apache.org/repos/asf/tika/blob/a13369b0/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
index 24f5a2e..5909b69 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-nltk.server.url=http://localhost:5000
+nltk.server.url=http://localhost:8881
http://git-wip-us.apache.org/repos/asf/tika/blob/a13369b0/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index a40ec24..5c1307f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -36,7 +36,7 @@ import static org.junit.Assert.assertTrue;
public class NLTKNERecogniserTest {
@Test
public void testGetEntityTypes() throws Exception {
- String text = "America";
+ String text = "America is a big country.";
System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
Metadata md = new Metadata();
[06/20] tika git commit: Used Apache CXF WebClient
Posted by ma...@apache.org.
Used Apache CXF WebClient
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/14ca3204
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/14ca3204
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/14ca3204
Branch: refs/heads/master
Commit: 14ca32045361918f5dd28c63c9692accbcfa31d5
Parents: 892beca
Author: manali <ma...@gmail.com>
Authored: Sat Feb 6 17:00:38 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Sat Feb 6 17:00:38 2016 -0800
----------------------------------------------------------------------
.../apache/tika/parser/ner/NERecogniser.java | 2 ++
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 34 +++++++-------------
.../parser/ner/nltk/NLTKNERecogniserTest.java | 7 ++--
3 files changed, 18 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/14ca3204/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
index c4693eb..3bebff2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
@@ -36,6 +36,8 @@ public interface NERecogniser {
String DATE = "DATE";
String PERCENT = "PERCENT";
String MONEY = "MONEY";
+ String FACILITY = "FACILITY";
+ String GPE = "GPE";
/**
* checks if this Named Entity recogniser is available for service
http://git-wip-us.apache.org/repos/asf/tika/blob/14ca3204/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index cb152f3..99cde6f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -41,6 +41,13 @@ import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
+import javax.ws.rs.core.Form;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
/**
* This class offers an implementation of {@link NERecogniser} based on
@@ -73,13 +80,8 @@ public class NLTKNERecogniser implements NERecogniser {
public NLTKNERecogniser(){
try {
String url = "http://localhost:5000/";
- HttpClient client = HttpClientBuilder.create().build();
- HttpGet get = new HttpGet(url);
-
- // add header
- get.setHeader("User-Agent", USER_AGENT);
- HttpResponse response = client.execute(get);
- int responseCode = response.getStatusLine().getStatusCode();
+ Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get();
+ int responseCode = response.getStatus();
if(responseCode == 200){
available = true;
}
@@ -118,22 +120,10 @@ public class NLTKNERecogniser implements NERecogniser {
Map<String, Set<String>> entities = new HashMap<>();
try {
String url = "http://localhost:5000/nltk";
- HttpClient client = HttpClientBuilder.create().build();
- HttpPost post = new HttpPost(url);
- post.setHeader("User-Agent", USER_AGENT);
- List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
- urlParameters.add(new BasicNameValuePair("text", text));
- post.setEntity(new UrlEncodedFormEntity(urlParameters));
-
- HttpResponse response = client.execute(post);
-
- int responseCode = response.getStatusLine().getStatusCode();
+ Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text));
+ int responseCode = response.getStatus();
if (responseCode == 200) {
- BufferedReader rd = new BufferedReader(
- new InputStreamReader(response.getEntity().getContent()));
-
- String result = rd.readLine();
-
+ String result = response.readEntity(String.class);
JSONParser parser = new JSONParser();
JSONObject j = (JSONObject) parser.parse(result);
JSONArray aa = new JSONArray();
http://git-wip-us.apache.org/repos/asf/tika/blob/14ca3204/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index ac04066..563e836 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -21,6 +21,7 @@ import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Ignore;
import org.junit.Test;
import java.io.ByteArrayInputStream;
@@ -34,7 +35,6 @@ import static org.junit.Assert.assertTrue;
public class NLTKNERecogniserTest {
@Test
public void testGetEntityTypes() throws Exception {
-
String text = "America";
System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
@@ -42,9 +42,10 @@ public class NLTKNERecogniserTest {
Metadata md = new Metadata();
tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
-
Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
- if(gpe.size() == 0) return;
+ if(gpe.size() == 0) {
+ return;
+ }
else {
assertTrue(gpe.contains("America"));
assertTrue(gpe.size() == 1); //and nothing else
[18/20] tika git commit: resolved conflicts
Posted by ma...@apache.org.
resolved conflicts
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e147de34
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e147de34
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e147de34
Branch: refs/heads/master
Commit: e147de3429b775f5036c78b18a1cb688971be0af
Parents: 0dbd69c cdb684d
Author: manali <ma...@gmail.com>
Authored: Tue Mar 1 01:05:34 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Mar 1 01:05:34 2016 -0800
----------------------------------------------------------------------
.../apache/tika/sax/RichTextContentHandler.java | 58 ++++++++++++++++++++
.../tika/server/RichTextContentHandler.java | 58 --------------------
2 files changed, 58 insertions(+), 58 deletions(-)
----------------------------------------------------------------------
[17/20] tika git commit: updated with changes
Posted by ma...@apache.org.
updated with changes
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0dbd69ce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0dbd69ce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0dbd69ce
Branch: refs/heads/master
Commit: 0dbd69cef5ec603a11d7f5b52f119e3bea1550b5
Parents: 7ebe007
Author: manali <ma...@gmail.com>
Authored: Tue Mar 1 00:59:11 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Mar 1 00:59:11 2016 -0800
----------------------------------------------------------------------
.../tika/sax/RichTextContentHandlerTest.java | 75 ++++++++++++++++++++
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 16 +++--
.../parser/ner/nltk/NLTKNERecogniserTest.java | 2 +-
3 files changed, 86 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
new file mode 100644
index 0000000..257ea38
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Test cases for the {@link RichTextContentHandler} class.
+ */
+public class RichTextContentHandlerTest {
+
+ /**
+ * Test to check that {@code a} tags are detected and the rich text version is used.
+ */
+ @Test
+ public void aTagTest() throws Exception {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(
+ new RichTextContentHandler(
+ new OutputStreamWriter(buffer, Charset.defaultCharset())),
+ new Metadata());
+ xhtml.startDocument();
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "", "name", "", "value");
+ xhtml.startElement("a", attributes);
+ xhtml.endDocument();
+
+ assertEquals("\n\n\n\n[bookmark: value]", buffer.toString(UTF_8.name()));
+ }
+
+ /**
+ * Test to check that {@code img} tags are detected and the rich text version is used.
+ */
+ @Test
+ public void imgTagTest() throws Exception {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(
+ new RichTextContentHandler(
+ new OutputStreamWriter(buffer, Charset.defaultCharset())),
+ new Metadata());
+ xhtml.startDocument();
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "", "alt", "", "value");
+ xhtml.startElement("img", attributes);
+ xhtml.endDocument();
+
+ assertEquals("\n\n\n\n[image: value]", buffer.toString(UTF_8.name()));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 1edfe28..5407189 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -22,8 +22,13 @@ import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.*;
-import java.util.*;
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collection;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Properties;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
@@ -41,6 +46,7 @@ public class NLTKNERecogniser implements NERecogniser {
private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
private static boolean available = false;
private static final String NLTK_REST_HOST = "http://localhost:8881";
+ private String restHostUrlStr;
/**
* some common entities identified by NLTK
*/
@@ -48,7 +54,7 @@ public class NLTKNERecogniser implements NERecogniser {
add("NAMES");
}};
- String restHostUrlStr;
+
public NLTKNERecogniser(){
try {
@@ -59,8 +65,7 @@ public class NLTKNERecogniser implements NERecogniser {
e.printStackTrace();
}
- if (restHostUrlStr == null
- || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+ if (restHostUrlStr == null || restHostUrlStr.equals("")) {
this.restHostUrlStr = NLTK_REST_HOST;
} else {
this.restHostUrlStr = restHostUrlStr;
@@ -115,7 +120,6 @@ public class NLTKNERecogniser implements NERecogniser {
public Map<String, Set<String>> recognise(String text) {
Map<String, Set<String>> entities = new HashMap<>();
try {
- int port = 8881;
String url = restHostUrlStr + "/nltk";
Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
int responseCode = response.getStatus();
http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 5c1307f..94d9a27 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -48,7 +48,7 @@ public class NLTKNERecogniserTest {
}
else {
assertTrue(names.contains("America"));
- assertTrue(names.size() == 1); //and nothing else
+ assertTrue(names.size() == 1);
}
}
}
[16/20] tika git commit: fix for TIKA-1876 contributed by manalishah
Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cdb684d9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cdb684d9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cdb684d9
Branch: refs/heads/master
Commit: cdb684d9c1b0ebb01a783180f07417760fa04d6f
Parents: 114d0ff
Author: manali <ma...@gmail.com>
Authored: Sat Feb 27 02:10:06 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Sat Feb 27 02:10:06 2016 -0800
----------------------------------------------------------------------
.../java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/cdb684d9/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 850f4dd..3e881fe 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -68,7 +68,7 @@ public class NLTKNERecogniser implements NERecogniser {
this.restHostUrlStr = restHostUrlStr;
}
//check if nltkrest is running
- Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+ Response response = WebClient.create(restHostUrlStr).get();
int responseCode = response.getStatus();
if(responseCode == 200){
available = true;
@@ -116,7 +116,7 @@ public class NLTKNERecogniser implements NERecogniser {
try {
int port = 8881;
String url = restHostUrlStr + "/nltk";
- Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+ Response response = WebClient.create(url).post(text);
int responseCode = response.getStatus();
if (responseCode == 200) {
String result = response.readEntity(String.class);
[09/20] tika git commit: created NLTK host server properties
Posted by ma...@apache.org.
created NLTK host server properties
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ac4c0b2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ac4c0b2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ac4c0b2c
Branch: refs/heads/master
Commit: ac4c0b2c9321bba395b92214a3504d8346c3e936
Parents: f054bcd
Author: manali <ma...@gmail.com>
Authored: Wed Feb 24 22:23:26 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Wed Feb 24 22:23:26 2016 -0800
----------------------------------------------------------------------
tika-parsers/pom.xml | 15 ++--
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 72 ++++++++++----------
.../tika/parser/ner/nltk/NLTKServer.properties | 16 +++++
.../parser/ner/nltk/NLTKNERecogniserTest.java | 8 +--
4 files changed, 63 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 41daf4d..088a6e9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -88,13 +88,6 @@
<version>2.1.1</version>
</dependency>
-
- <!-- manali added this-->
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient</artifactId>
- <version>4.5.1</version>
- </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
<groupId>org.apache.felix</groupId>
@@ -366,6 +359,14 @@
<version>3.2.2</version>
<scope>provided</scope>
</dependency>
+
+ <!--Jackson parse String to JSON-->
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>2.7.1</version>
+ </dependency>
+
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 99cde6f..eddddcb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -16,38 +16,18 @@
*/
package org.apache.tika.parser.ner.nltk;
-import org.apache.http.client.methods.HttpGet;
import org.apache.tika.parser.ner.NERecogniser;
-import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.message.BasicNameValuePair;
-
-import javax.ws.rs.core.Form;
+import java.io.*;
+import java.util.*;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
-import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
/**
* This class offers an implementation of {@link NERecogniser} based on
@@ -59,9 +39,8 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
public class NLTKNERecogniser implements NERecogniser {
private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
- private final static String USER_AGENT = "Mozilla/5.0";
private static boolean available = false;
-
+ private static final String NLTK_REST_HOST = "http://localhost:8881";
/**
* some common entities identified by NLTK
*/
@@ -75,12 +54,31 @@ public class NLTKNERecogniser implements NERecogniser {
add(DATE);
add(FACILITY);
add(GPE);
+ add("NAMES");
}};
+ String restHostUrlStr;
public NLTKNERecogniser(){
try {
- String url = "http://localhost:5000/";
- Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get();
+
+ String restHostUrlStr="";
+ try {
+ restHostUrlStr = readRestUrl();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ if (restHostUrlStr == null
+ || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+ this.restHostUrlStr = NLTK_REST_HOST;
+ } else {
+ this.restHostUrlStr = restHostUrlStr;
+ }
+
+
+
+
+ Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
int responseCode = response.getStatus();
if(responseCode == 200){
available = true;
@@ -94,6 +92,13 @@ public class NLTKNERecogniser implements NERecogniser {
}
}
+ private static String readRestUrl() throws IOException {
+ Properties nltkProperties = new Properties();
+ nltkProperties.load(NLTKNERecogniser.class
+ .getResourceAsStream("NLTKServer.properties"));
+
+ return nltkProperties.getProperty("nltk.server.url");
+ }
/**
* @return {@code true} if server endpoint is available.
@@ -119,22 +124,15 @@ public class NLTKNERecogniser implements NERecogniser {
public Map<String, Set<String>> recognise(String text) {
Map<String, Set<String>> entities = new HashMap<>();
try {
- String url = "http://localhost:5000/nltk";
- Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text));
+ int port = 8881;
+ String url = restHostUrlStr + "/nltk";
+ Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
int responseCode = response.getStatus();
if (responseCode == 200) {
String result = response.readEntity(String.class);
JSONParser parser = new JSONParser();
JSONObject j = (JSONObject) parser.parse(result);
- JSONArray aa = new JSONArray();
- for (Object x : j.keySet()) {
- aa = (JSONArray) j.get(x.toString());
- Set s = new HashSet();
- for (Object y : aa) {
- s.add(y.toString());
- }
- entities.put(x.toString(), s);
- }
+ Set s = entities.put("NAMES", new HashSet((Collection) j.get("names")));
}
}
catch (Exception e) {
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..24f5a2e
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+nltk.server.url=http://localhost:5000
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 2861051..a40ec24 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -42,13 +42,13 @@ public class NLTKNERecogniserTest {
Metadata md = new Metadata();
tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
- Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
- if(gpe.size() == 0) {
+ Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+ if(names.size() == 0) {
return;
}
else {
- assertTrue(gpe.contains("America"));
- assertTrue(gpe.size() == 1); //and nothing else
+ assertTrue(names.contains("America"));
+ assertTrue(names.size() == 1); //and nothing else
}
}
}
[14/20] tika git commit: fix for TIKA-1876 contributed by manalishah
Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a7e24c9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a7e24c9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a7e24c9
Branch: refs/heads/master
Commit: 3a7e24c9a5d77ae41bde0c2106233a2064c5e707
Parents: c809690
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 20:00:05 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 20:00:05 2016 -0800
----------------------------------------------------------------------
.gitignore | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/3a7e24c9/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 8093709..40c895f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ target
*.iws
*.bin
nbactions.xml
-nb-configuration.xml*.DS_Store
+nb-configuration.xml
+*.DS_Store
[12/20] tika git commit: fix for TIKA-1876 contributed by manalishah
Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7ebe007e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7ebe007e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7ebe007e
Branch: refs/heads/master
Commit: 7ebe007ec03088449f67619ef1e6cb564178b14b
Parents: a13369b
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 18:36:02 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 18:36:02 2016 -0800
----------------------------------------------------------------------
CHANGES.txt | 2 ++
.../src/main/java/org/apache/tika/mime/MimeType.java | 1 -
.../org/apache/tika/mime/tika-mimetypes.xml | 13 ++++++-------
.../tika/parser/microsoft/ooxml/XWPFListManager.java | 4 ++++
.../org/apache/tika/parser/ner/NERecogniser.java | 2 --
.../apache/tika/server/RichTextContentHandler.java | 15 +++++++++++++--
.../apache/tika/server/resource/TikaResource.java | 2 +-
.../tika/server/resource/UnpackerResource.java | 2 +-
8 files changed, 27 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bb30540..0ffc69f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -9,6 +9,8 @@ Release 1.13 - ???
* Upgrade to Jackson 2.7.1 (TIKA-1869).
+ * RichTextContentHandler moved from the Server package to Core (TIKA-1870).
+
Release 1.12 - 01/24/2016
* Support for iFrames and element link extraction is provided in
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index fc520cf..b4d651e 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -270,7 +270,6 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
}
}
-
void addMagic(Magic magic) {
if (magic == null) {
return;
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5bb30fc..95f41e6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -38,12 +38,6 @@
-->
<mime-info>
- <mime-type type="application/dicom">
- <_comment>DICOM medical imaging data</_comment>
- <magic priority="50">
- <match value="DICM" type="string" offset="128"/>
- </magic>
- </mime-type>
<mime-type type="application/activemessage"/>
<mime-type type="application/andrew-inset">
<glob pattern="*.ez"/>
@@ -118,7 +112,12 @@
<mime-type type="application/dec-dx"/>
<mime-type type="application/dialog-info+xml"/>
-
+ <mime-type type="application/dicom">
+ <_comment>DICOM medical imaging data</_comment>
+ <magic priority="50">
+ <match value="DICM" type="string" offset="128"/>
+ </magic>
+ </mime-type>
<mime-type type="application/dita+xml">
<sub-class-of type="application/xml"/>
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index 5654378..a938c2f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -57,6 +57,10 @@ public class XWPFListManager extends AbstractListManager {
* @return the formatted number or an empty string if something went wrong
*/
public String getFormattedNumber(final XWPFParagraph paragraph) {
+ if (numbering == null) {
+ return "";
+ }
+
int currNumId = paragraph.getNumID().intValue();
XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
if (xwpfNum == null) {
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
index 3bebff2..c4693eb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
@@ -36,8 +36,6 @@ public interface NERecogniser {
String DATE = "DATE";
String PERCENT = "PERCENT";
String MONEY = "MONEY";
- String FACILITY = "FACILITY";
- String GPE = "GPE";
/**
* checks if this Named Entity recogniser is available for service
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java b/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
index 81095a7..8fcc4d5 100644
--- a/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
+++ b/tika-server/src/main/java/org/apache/tika/server/RichTextContentHandler.java
@@ -15,15 +15,26 @@
* limitations under the License.
*/
-package org.apache.tika.server;
+package org.apache.tika.sax;
import java.io.Writer;
-import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
+/**
+ * Content handler for Rich Text, it will extract XHTML <img/>
+ * tag <alt/> attribute and XHTML <a/> tag <name/>
+ * attribute into the output.
+ */
public class RichTextContentHandler extends WriteOutContentHandler {
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * the given writer.
+ *
+ * @param writer writer
+ */
public RichTextContentHandler(Writer writer) {
super(writer);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index d74ef74..566203a 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -72,7 +72,7 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
-import org.apache.tika.server.RichTextContentHandler;
+import org.apache.tika.sax.RichTextContentHandler;
import org.apache.tika.server.TikaServerParseException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
http://git-wip-us.apache.org/repos/asf/tika/blob/7ebe007e/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index cf3a0e9..8ee516e 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -58,7 +58,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.server.RichTextContentHandler;
+import org.apache.tika.sax.RichTextContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
[07/20] tika git commit: nltk modification
Posted by ma...@apache.org.
nltk modification
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1b14b39d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1b14b39d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1b14b39d
Branch: refs/heads/master
Commit: 1b14b39d3e1b453620b2f7b26a933103a78c958a
Parents: 14ca320
Author: manali <ma...@gmail.com>
Authored: Fri Feb 19 17:37:25 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 19 17:37:25 2016 -0800
----------------------------------------------------------------------
.../src/main/java/org/apache/tika/mime/MimeType.java | 1 +
.../resources/org/apache/tika/mime/tika-mimetypes.xml | 13 +++++++------
.../tika/parser/ner/nltk/NLTKNERecogniserTest.java | 2 +-
3 files changed, 9 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/1b14b39d/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index b4d651e..fc520cf 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -270,6 +270,7 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
}
}
+
void addMagic(Magic magic) {
if (magic == null) {
return;
http://git-wip-us.apache.org/repos/asf/tika/blob/1b14b39d/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 1d7b42b..52dd67b 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -38,6 +38,12 @@
-->
<mime-info>
+ <mime-type type="application/dicom">
+ <_comment>DICOM medical imaging data</_comment>
+ <magic priority="50">
+ <match value="DICM" type="string" offset="128"/>
+ </magic>
+ </mime-type>
<mime-type type="application/activemessage"/>
<mime-type type="application/andrew-inset">
<glob pattern="*.ez"/>
@@ -112,12 +118,7 @@
<mime-type type="application/dec-dx"/>
<mime-type type="application/dialog-info+xml"/>
- <mime-type type="application/dicom">
- <_comment>DICOM medical imaging data</_comment>
- <magic priority="50">
- <match value="DICM" type="string" offset="128"/>
- </magic>
- </mime-type>
+
<mime-type type="application/dita+xml">
<sub-class-of type="application/xml"/>
http://git-wip-us.apache.org/repos/asf/tika/blob/1b14b39d/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 563e836..2861051 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -25,6 +25,7 @@ import org.junit.Ignore;
import org.junit.Test;
import java.io.ByteArrayInputStream;
+import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
@@ -37,7 +38,6 @@ public class NLTKNERecogniserTest {
public void testGetEntityTypes() throws Exception {
String text = "America";
System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
-
Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
Metadata md = new Metadata();
tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
[10/20] tika git commit: Merge remote-tracking branch
'upstream/master'
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master'
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c595fbd
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c595fbd
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c595fbd
Branch: refs/heads/master
Commit: 6c595fbd3ecefe598cc96b34e2e0baa1f064f6df
Parents: ac4c0b2 08e38bb
Author: manali <ma...@gmail.com>
Authored: Wed Feb 24 22:25:07 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Wed Feb 24 22:25:07 2016 -0800
----------------------------------------------------------------------
CHANGES.txt | 2 ++
tika-translate/pom.xml | 2 +-
.../language/translate/CachedTranslator.java | 25 ++++++++++++--------
3 files changed, 18 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
[03/20] tika git commit: Update NLTKNERecogniser.java
Posted by ma...@apache.org.
Update NLTKNERecogniser.java
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/db2b4757
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/db2b4757
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/db2b4757
Branch: refs/heads/master
Commit: db2b475733dffca63143551a5f1ddd89d97f0960
Parents: 2b99eea
Author: Manali Shah <ma...@usc.edu>
Authored: Tue Feb 2 00:33:17 2016 -0800
Committer: Manali Shah <ma...@usc.edu>
Committed: Tue Feb 2 00:33:17 2016 -0800
----------------------------------------------------------------------
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/db2b4757/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index eb216ea..cb152f3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -44,11 +44,9 @@ import org.apache.http.message.BasicNameValuePair;
/**
* This class offers an implementation of {@link NERecogniser} based on
- * CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
- * due to runtime binding to Stanford CoreNLP.
+ * ne_chunk() module of NLTK. This NER requires additional setup,
+ * due to Http requests to an endpoint server that runs NLTK.
* See <a href="http://wiki.apache.org/tika/TikaAndNER#NLTK">
- * Tika NER Wiki</a> for configuring this recogniser.
- * @see NERecogniser
*
*/
public class NLTKNERecogniser implements NERecogniser {
@@ -56,6 +54,10 @@ public class NLTKNERecogniser implements NERecogniser {
private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
private final static String USER_AGENT = "Mozilla/5.0";
private static boolean available = false;
+
+ /**
+ * some common entities identified by NLTK
+ */
public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
add(PERSON);
add(TIME);
@@ -70,7 +72,6 @@ public class NLTKNERecogniser implements NERecogniser {
public NLTKNERecogniser(){
try {
-
String url = "http://localhost:5000/";
HttpClient client = HttpClientBuilder.create().build();
HttpGet get = new HttpGet(url);
@@ -93,9 +94,8 @@ public class NLTKNERecogniser implements NERecogniser {
/**
- *
- * @return {@code true} if model was available, valid and was able to initialise the classifier.
- * returns {@code false} when this recogniser is not available for service.
+ * @return {@code true} if server endpoint is available.
+ * returns {@code false} if server endpoint is not available for service.
*/
public boolean isAvailable() {
return available;
@@ -120,7 +120,6 @@ public class NLTKNERecogniser implements NERecogniser {
String url = "http://localhost:5000/nltk";
HttpClient client = HttpClientBuilder.create().build();
HttpPost post = new HttpPost(url);
- // add header
post.setHeader("User-Agent", USER_AGENT);
List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
urlParameters.add(new BasicNameValuePair("text", text));
@@ -153,7 +152,6 @@ public class NLTKNERecogniser implements NERecogniser {
}
ENTITY_TYPES.clear();
ENTITY_TYPES.addAll(entities.keySet());
- LOG.info("returning this:" + entities.keySet().toString());
return entities;
}
[02/20] tika git commit: Merge remote-tracking branch
'upstream/master' Integrated NLTK into Tika Parsers by using endpoint as
NLTKRest
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master'
Integrated NLTK into Tika Parsers by using endpoint as NLTKRest
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2b99eeab
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2b99eeab
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2b99eeab
Branch: refs/heads/master
Commit: 2b99eeab8bb4c8aaaf13202904910298dc446c05
Parents: d685742 256209a
Author: manali <ma...@gmail.com>
Authored: Tue Feb 2 00:18:16 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Tue Feb 2 00:18:16 2016 -0800
----------------------------------------------------------------------
CHANGES.txt | 4 ++++
tika-parsers/pom.xml | 2 +-
.../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 8 ++------
3 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/2b99eeab/tika-parsers/pom.xml
----------------------------------------------------------------------
[15/20] tika git commit: fix for TIKA-1876 contributed by manalishah
Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/114d0ff2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/114d0ff2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/114d0ff2
Branch: refs/heads/master
Commit: 114d0ff24bd04395852012a3382d50c3e906e6db
Parents: 3a7e24c
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 20:06:20 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 20:06:20 2016 -0800
----------------------------------------------------------------------
tika-parsers/pom.xml | 8 ++++++++
1 file changed, 8 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/114d0ff2/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 885bbc2..4f92b1b 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -359,6 +359,14 @@
<version>3.2.2</version>
<scope>provided</scope>
</dependency>
+
+ <!--Jackson parse String to JSON-->
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>2.7.1</version>
+ </dependency>
+
</dependencies>
<build>
[04/20] tika git commit: Update NLTKNERecogniserTest.java
Posted by ma...@apache.org.
Update NLTKNERecogniserTest.java
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/59ddcaac
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/59ddcaac
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/59ddcaac
Branch: refs/heads/master
Commit: 59ddcaac5f479b4485d35c061e81dfa612773aab
Parents: db2b475
Author: Manali Shah <ma...@usc.edu>
Authored: Tue Feb 2 00:35:41 2016 -0800
Committer: Manali Shah <ma...@usc.edu>
Committed: Tue Feb 2 00:35:41 2016 -0800
----------------------------------------------------------------------
.../org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java | 3 ---
1 file changed, 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/59ddcaac/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 4fbeb42..23a174c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -1,8 +1,5 @@
package org.apache.tika.parser.ner.nltk;
-/**
- * Created by manali on 2/1/16.
- */
import org.apache.commons.logging.Log;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
[13/20] tika git commit: fix for TIKA-1876 contributed by manalishah
Posted by ma...@apache.org.
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c809690e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c809690e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c809690e
Branch: refs/heads/master
Commit: c809690ec87ffa600018dbc5eee6d6756645adb0
Parents: ed762b7
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 19:58:06 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 19:58:06 2016 -0800
----------------------------------------------------------------------
.gitignore | 2 +-
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 137 +++++++++++++++++++
.../tika/parser/ner/nltk/NLTKServer.properties | 16 +++
.../parser/ner/nltk/NLTKNERecogniserTest.java | 54 ++++++++
4 files changed, 208 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c262c68..8093709 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,4 @@ target
*.iws
*.bin
nbactions.xml
-nb-configuration.xml
\ No newline at end of file
+nb-configuration.xml*.DS_Store
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..850f4dd
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.util.*;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ * An implementation of {@link NERecogniser} backed by the {@code ne_chunk()}
+ * module of NLTK. It needs additional setup because it sends HTTP requests to
+ * an endpoint server that runs NLTK; that endpoint is implemented as a
+ * pip/setuptools installable Python module.
+ * See <a href="https://github.com/manalishah/NLTKRest">NLTKRest</a> and
+ * <a href="http://wiki.apache.org/tika/TikaAndNLTK">TikaAndNLTK</a>.
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+    /** Becomes {@code true} once a constructor's probe request has returned HTTP 200. */
+    private static boolean available = false;
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
+    /**
+     * some common entities identified by NLTK
+     */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add("NAMES");
+    }};
+
+    /** Base URL of the NLTKRest server used by this instance; never left unset by the constructor. */
+    String restHostUrlStr;
+
+    /**
+     * Resolves the server URL from {@code NLTKServer.properties}, falling back to
+     * {@code NLTK_REST_HOST}, then probes that URL with a GET request.
+     */
+    public NLTKNERecogniser(){
+        String configuredUrl = null;
+        try {
+            configuredUrl = readRestUrl();
+        } catch (IOException | IllegalArgumentException e) {
+            LOG.warn("Could not read NLTKServer.properties; falling back to " + NLTK_REST_HOST, e);
+        }
+        this.restHostUrlStr = (configuredUrl == null || configuredUrl.isEmpty()) ? NLTK_REST_HOST : configuredUrl;
+        try {
+            // Probe the resolved URL held in the field, so the default host is
+            // the one checked when no URL is configured.
+            Response response = WebClient.create(this.restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            if (response.getStatus() == 200) {
+                available = true;
+            } else {
+                LOG.info("NLTKRest Server is not running");
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Reads the {@code nltk.server.url} property from {@code NLTKServer.properties}.
+     * @return the property value, or {@code null} if the resource or the key is missing
+     * @throws IOException if the resource cannot be read
+     */
+    private static String readRestUrl() throws IOException {
+        try (InputStream in = NLTKNERecogniser.class.getResourceAsStream("NLTKServer.properties")) {
+            if (in == null) {
+                return null; // resource is not on the classpath
+            }
+            Properties nltkProperties = new Properties();
+            nltkProperties.load(in);
+            return nltkProperties.getProperty("nltk.server.url");
+        }
+    }
+
+    /**
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names; empty when the request or parsing fails
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        try {
+            String url = restHostUrlStr + "/nltk";
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+            if (response.getStatus() == 200) {
+                String result = response.readEntity(String.class);
+                JSONObject j = (JSONObject) new JSONParser().parse(result);
+                @SuppressWarnings("unchecked") // json-simple exposes untyped collections
+                Collection<String> names = (Collection<String>) j.get("names");
+                entities.put("NAMES", new HashSet<String>(names));
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+        ENTITY_TYPES.addAll(entities.keySet());
+        return entities;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..5909b69
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+nltk.server.url=http://localhost:8881
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..5c1307f
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.commons.logging.Log;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+    // Parses one sentence with NLTKNERecogniser selected as the NER implementation and,
+    // if any names are reported, expects "America" to be the only one.
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+        String sentence = "America is a big country.";
+        Metadata metadata = new Metadata();
+        tika.parse(new ByteArrayInputStream(sentence.getBytes(StandardCharsets.UTF_8)), metadata);
+
+        Set<String> found = new HashSet<>(Arrays.asList(metadata.getValues("NER_NAMES")));
+        // No names at all is tolerated (presumably the NLTK server is not running); nothing to verify then.
+        if (found.size() != 0) {
+            assertTrue(found.contains("America"));
+            assertTrue(found.size() == 1); //and nothing else
+        }
+    }
+}
[19/20] tika git commit: Fix for TIKA-1876 Integrate Natural Language
Toolkit (NLTK) into Tika to perform Named Entity Recognition contributed by
Manali Shah this closes #80
Posted by ma...@apache.org.
Fix for TIKA-1876 Integrate Natural Language Toolkit (NLTK) into Tika to perform Named Entity Recognition contributed by Manali Shah <ma...@gmail.com> this closes #80
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3fbc03ce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3fbc03ce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3fbc03ce
Branch: refs/heads/master
Commit: 3fbc03cead1c54bd023a19e52e31609b51926d7d
Parents: e147de3
Author: Chris Mattmann <ma...@apache.org>
Authored: Tue Mar 1 21:41:07 2016 -0800
Committer: Chris Mattmann <ma...@apache.org>
Committed: Tue Mar 1 21:41:07 2016 -0800
----------------------------------------------------------------------
CHANGES.txt | 4 ++++
1 file changed, 4 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/3fbc03ce/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 0ffc69f..d5bebcd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,8 @@
Release 1.13 - ???
+
+ * Tika now incorporates the Natural Language Toolkit (NLTK) from the
+ Python community as an option for Named Entity Recognition (TIKA-1876).
+
* Upgrade to sqlite-jdbc 3.8.11.2 (TIKA-1861). NOTE: this dependency
is still <scope>provided</scope>. You need to include this dependency
 in order to parse sqlite files.