You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/08/14 07:44:51 UTC

svn commit: r1695816 - in /tika/trunk/tika-parsers: ./ src/main/java/org/apache/tika/parser/journal/ src/main/resources/META-INF/services/ src/main/resources/org/apache/tika/parser/journal/ src/test/java/org/apache/tika/parser/journal/ src/test/resourc...

Author: mattmann
Date: Fri Aug 14 05:44:50 2015
New Revision: 1695816

URL: http://svn.apache.org/r1695816
Log:
- fix for TIKA-1699: Integrate the GROBID PDF extractor in Tika contributed by Sujen Shah <su...@gmail.com> this closes #55.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidConfig.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidHeaderMetadata.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf   (with props)
Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1695816&r1=1695815&r2=1695816&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Fri Aug 14 05:44:50 2015
@@ -232,6 +232,14 @@
       <version>0.7</version>
     </dependency>
 
+    <!-- GROBID dependencies -->
+    <dependency>
+      <groupId>org.grobid</groupId>
+      <artifactId>grobid-core</artifactId>
+      <version>0.3.4</version>
+    </dependency>
+
+
     <!-- Provided dependencies -->
     <dependency>
       <groupId>org.xerial</groupId>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidConfig.java?rev=1695816&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidConfig.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidConfig.java Fri Aug 14 05:44:50 2015
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Configuration for the GROBID integration: the GROBID home directory and the
+ * GROBID properties file.
+ * <p>
+ * Initial values are read from the <code>GrobidExtractor.properties</code>
+ * resource resolved relative to this class. Both values remain
+ * <code>null</code> when that resource is missing or does not define them;
+ * the setters can be used to supply or override them afterwards.
+ */
+public class GrobidConfig {
+
+  /** Prefix GrobidParser puts in front of every metadata key it adds. */
+  public static final String GROBID_PREFIX = "grobid:";
+
+  /** Prefix GrobidHeaderMetadata puts in front of every header property name. */
+  public static final String HEADER_METADATA_PREFIX = "header_";
+
+  private static final Logger LOG = Logger.getLogger(GrobidConfig.class
+      .getName());
+
+  private static final String CONFIG_RESOURCE = "GrobidExtractor.properties";
+
+  private String grobidHome;
+  private String grobidProperties;
+
+  /**
+   * Creates a configuration initialised from the bundled properties resource,
+   * if one can be found.
+   */
+  public GrobidConfig() {
+    init(this.getClass().getResourceAsStream(CONFIG_RESOURCE));
+  }
+
+  /**
+   * Loads <code>grobid.home</code> and <code>grobid.properties</code> from the
+   * given stream and closes it. A <code>null</code> stream is ignored; a read
+   * failure is logged and leaves the current values in place for any key that
+   * was not loaded.
+   *
+   * @param in stream over a properties file, or <code>null</code>
+   */
+  private void init(InputStream in) {
+    if (in == null) {
+      return;
+    }
+
+    Properties props = new Properties();
+    // try-with-resources closes the stream whether or not load() succeeds
+    try (InputStream stream = in) {
+      props.load(stream);
+    } catch (IOException e) {
+      // best effort: configuration is optional, so report and carry on
+      LOG.log(Level.WARNING, "Unable to read " + CONFIG_RESOURCE, e);
+    }
+
+    // assign the fields directly: calling the overridable setters from the
+    // constructor would run subclass code on a half-constructed object
+    this.grobidHome = props.getProperty("grobid.home", this.grobidHome);
+    this.grobidProperties = props.getProperty("grobid.properties",
+        this.grobidProperties);
+  }
+
+  /**
+   * @return the configured GROBID home path, or <code>null</code> if unset
+   */
+  public String getGrobidHome() {
+    return grobidHome;
+  }
+
+  /**
+   * @param grobidHome the GROBID home path to use
+   */
+  public void setGrobidHome(String grobidHome) {
+    this.grobidHome = grobidHome;
+  }
+
+  /**
+   * @return the configured GROBID properties file path, or <code>null</code>
+   *         if unset
+   */
+  public String getGrobidProperties() {
+    return grobidProperties;
+  }
+
+  /**
+   * @param grobidProperties the GROBID properties file path to use
+   */
+  public void setGrobidProperties(String grobidProperties) {
+    this.grobidProperties = grobidProperties;
+  }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidHeaderMetadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidHeaderMetadata.java?rev=1695816&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidHeaderMetadata.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidHeaderMetadata.java Fri Aug 14 05:44:50 2015
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.beans.BeanInfo;
+import java.beans.IntrospectionException;
+import java.beans.Introspector;
+import java.beans.PropertyDescriptor;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.grobid.core.data.BiblioItem;
+
+/**
+ * Flattens a GROBID {@link BiblioItem} into a map of string metadata.
+ * <p>
+ * Every readable bean property with a non-null value becomes one entry. The
+ * key is {@link GrobidConfig#HEADER_METADATA_PREFIX} followed by the getter
+ * name without its leading "get" (<code>getTitle()</code> yields
+ * <code>header_Title</code>); the value is the property's string form.
+ */
+public class GrobidHeaderMetadata {
+
+  private static final Logger LOG = Logger
+      .getLogger(GrobidHeaderMetadata.class.getName());
+
+  private static final String GETTER_PREFIX = "get";
+
+  // never null, so callers can iterate before anything has been generated
+  private Map<String, String> headerMetadata = new HashMap<String, String>();
+
+  /**
+   * Rebuilds the header metadata from the given item, discarding any previous
+   * content. The misspelt method name is kept so existing callers still compile.
+   *
+   * @param resHeader the GROBID header result to read; <code>null</code>
+   *          leaves the metadata empty
+   */
+  public void generateHeaderMetada(BiblioItem resHeader) {
+    headerMetadata = new HashMap<String, String>();
+    if (resHeader == null) {
+      return;
+    }
+
+    BeanInfo info;
+    try {
+      // stop at Object so getClass() is not reported as a "Class" property
+      info = Introspector.getBeanInfo(BiblioItem.class, Object.class);
+    } catch (IntrospectionException e) {
+      LOG.log(Level.WARNING, "Unable to introspect BiblioItem", e);
+      return;
+    }
+
+    for (PropertyDescriptor pd : info.getPropertyDescriptors()) {
+      Method m = pd.getReadMethod();
+      if (m == null) {
+        continue;
+      }
+      try {
+        Object value = m.invoke(resHeader);
+        if (value != null) {
+          headerMetadata.put(GrobidConfig.HEADER_METADATA_PREFIX
+              + keySuffix(m), String.valueOf(value));
+        }
+      } catch (IllegalAccessException | IllegalArgumentException
+          | InvocationTargetException e) {
+        // one failing getter must not cost us the remaining properties
+        LOG.log(Level.WARNING, "Unable to read BiblioItem property via "
+            + m.getName(), e);
+      }
+    }
+  }
+
+  /**
+   * Derives the key suffix from a getter name: a leading "get" is removed,
+   * any other name (for example an "is" getter) is returned unchanged.
+   *
+   * @param getter the property's read method
+   * @return the name to append to the header prefix
+   */
+  private static String keySuffix(Method getter) {
+    String name = getter.getName();
+    if (name.startsWith(GETTER_PREFIX)) {
+      return name.substring(GETTER_PREFIX.length());
+    }
+    return name;
+  }
+
+  /**
+   * @return the current header metadata; never <code>null</code>
+   */
+  public Map<String, String> getHeaderMetadata() {
+    return headerMetadata;
+  }
+
+  /**
+   * @param headerMetadata the map to expose; <code>null</code> is replaced by
+   *          an empty map
+   */
+  public void setHeaderMetadata(Map<String, String> headerMetadata) {
+    this.headerMetadata = headerMetadata != null ? headerMetadata
+        : new HashMap<String, String>();
+  }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidParser.java?rev=1695816&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidParser.java Fri Aug 14 05:44:50 2015
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.grobid.core.data.BiblioItem;
+import org.grobid.core.engines.Engine;
+import org.grobid.core.factory.GrobidFactory;
+import org.grobid.core.mock.MockContext;
+import org.grobid.core.utilities.GrobidProperties;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs GROBID header extraction on a file and copies the result into Tika
+ * {@link Metadata} under keys starting with
+ * {@link GrobidConfig#GROBID_PREFIX}.
+ */
+public class GrobidParser {
+
+  private static final Logger LOG = Logger.getLogger(GrobidParser.class
+      .getName());
+
+  // placeholder values shipped in GrobidExtractor.properties; seeing them means
+  // that GROBID has not been configured
+  private static final String GROBID_HOME_UNSET_VALUE = "/path/to/grobid-home";
+
+  private static final String GROBID_PROPERTIES_UNSET_VALUE = "/path/to/grobid-home/config/grobid.properties";
+
+  public GrobidParser() {
+
+  }
+
+  /**
+   * Extracts header metadata from the given file with GROBID and adds it to
+   * <code>metadata</code>.
+   * <p>
+   * Returns without doing anything when {@link #canRun(GrobidConfig)} rejects
+   * the default configuration. Failures raised while running GROBID are logged
+   * and not propagated, so callers can treat this step as optional.
+   *
+   * @param filePath path of the file handed to GROBID
+   * @param handler not used by this method
+   * @param metadata receives one value per extracted header property
+   * @param context not used by this method
+   */
+  public void parse(String filePath, ContentHandler handler, Metadata metadata,
+      ParseContext context) {
+    GrobidConfig gConfig = new GrobidConfig();
+    if (!canRun(gConfig)) {
+      return;
+    }
+
+    try {
+      MockContext.setInitialContext(gConfig.getGrobidHome(),
+          gConfig.getGrobidProperties());
+      GrobidProperties.getInstance();
+
+      Engine engine = GrobidFactory.getInstance().createEngine();
+      BiblioItem resHeader = new BiblioItem();
+      engine.processHeader(filePath, false, resHeader);
+      GrobidHeaderMetadata headerMetadata = new GrobidHeaderMetadata();
+      headerMetadata.generateHeaderMetada(resHeader);
+      populateTikaMetadata(headerMetadata, metadata);
+    } catch (Exception e) {
+      // deliberately broad: this is the boundary to a third-party library and
+      // the extraction is best effort, but the failure must stay diagnosable
+      LOG.log(Level.WARNING, "GROBID header extraction failed for " + filePath,
+          e);
+    }
+  }
+
+  /**
+   * Adds every generated header entry to <code>metadata</code>, prefixing the
+   * key with {@link GrobidConfig#GROBID_PREFIX}.
+   */
+  private void populateTikaMetadata(GrobidHeaderMetadata gheaderMetada,
+      Metadata metadata) {
+    Map<String, String> header = gheaderMetada.getHeaderMetadata();
+    if (header == null) {
+      // nothing was generated
+      return;
+    }
+    for (Entry<String, String> pair : header.entrySet()) {
+      metadata.add(GrobidConfig.GROBID_PREFIX + pair.getKey(), pair.getValue());
+    }
+  }
+
+  /**
+   * Tells whether GROBID can be used with the given configuration.
+   *
+   * @param gConfig the configuration to check; may be <code>null</code>
+   * @return <code>true</code> only if both the GROBID home and the GROBID
+   *         properties path are non-empty and differ from their placeholder
+   *         values
+   */
+  protected static boolean canRun(GrobidConfig gConfig) {
+    return gConfig != null
+        && isSet(gConfig.getGrobidHome(), GROBID_HOME_UNSET_VALUE)
+        && isSet(gConfig.getGrobidProperties(), GROBID_PROPERTIES_UNSET_VALUE);
+  }
+
+  /**
+   * @return <code>true</code> if <code>value</code> is neither
+   *         <code>null</code>, empty, nor equal to <code>placeholder</code>
+   */
+  private static boolean isSet(String value, String placeholder) {
+    return value != null && !value.isEmpty() && !value.equals(placeholder);
+  }
+
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java?rev=1695816&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java Fri Aug 14 05:44:50 2015
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for <code>application/pdf</code> that first lets
+ * {@link GrobidParser} add header metadata and then delegates the actual
+ * document parsing to {@link PDFParser}.
+ */
+public class JournalParser extends AbstractParser {
+
+  /**
+   * Generated serial ID
+   */
+  private static final long serialVersionUID = 4664255544154296438L;
+
+  private static final MediaType TYPE = MediaType.application("pdf");
+
+  private static final Set<MediaType> SUPPORTED_TYPES = Collections
+      .singleton(TYPE);
+
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return SUPPORTED_TYPES;
+  }
+
+  /**
+   * Materialises the stream as a file, runs GROBID on that file and then
+   * parses the same file with {@link PDFParser}.
+   * <p>
+   * The temporary resources and the file stream opened by this method are
+   * released before it returns, whether or not parsing succeeds.
+   */
+  @Override
+  public void parse(InputStream stream, ContentHandler handler,
+      Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
+    TemporaryResources tmp = new TemporaryResources();
+    try {
+      TikaInputStream tis = TikaInputStream.get(stream, tmp);
+      File tmpFile = tis.getFile();
+
+      GrobidParser grobidParser = new GrobidParser();
+      grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);
+
+      PDFParser parser = new PDFParser();
+      // close the file handle even if the PDF parser throws
+      try (InputStream pdfStream = new FileInputStream(tmpFile)) {
+        parser.parse(pdfStream, handler, metadata, context);
+      }
+    } finally {
+      // releases whatever temporary resources getFile() registered
+      tmp.dispose();
+    }
+  }
+}

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1695816&r1=1695815&r2=1695816&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Fri Aug 14 05:44:50 2015
@@ -66,3 +66,4 @@ org.apache.tika.parser.isatab.ISArchiveP
 org.apache.tika.parser.geoinfo.GeographicInformationParser
 org.apache.tika.parser.geo.topic.GeoParser
 org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.journal.JournalParser
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties?rev=1695816&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties (added)
+++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties Fri Aug 14 05:44:50 2015
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+grobid.home=/path/to/grobid-home
+grobid.properties=/path/to/grobid-home/config/grobid.properties

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java?rev=1695816&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java Fri Aug 14 05:44:50 2015
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.apache.tika.parser.journal.GrobidParser.canRun;
+import java.io.InputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Integration test for {@link JournalParser}; skipped unless GROBID is
+ * configured.
+ */
+public class JournalParserTest {
+
+  @Test
+  public void testJournalParser() throws Exception {
+    // skip (rather than fail) when no usable GROBID installation is configured
+    assumeTrue(canRun(new GrobidConfig()));
+
+    String path = "/test-documents/testJournalParser.pdf";
+    ContentHandler handler = new BodyContentHandler();
+    Metadata metadata = new Metadata();
+    JournalParser jParser = new JournalParser();
+
+    // let parser exceptions propagate so JUnit reports the full stack trace
+    try (InputStream stream = JournalParserTest.class
+        .getResourceAsStream(path)) {
+      if (stream == null) {
+        fail("Test document not found on the classpath: " + path);
+      }
+      jParser.parse(stream, handler, metadata, new ParseContext());
+    }
+
+    assertNotNull(metadata.get("grobid:header_Title"));
+  }
+}

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf?rev=1695816&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream