You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/08/17 06:10:12 UTC

svn commit: r1696191 - in /tika/trunk/tika-parsers: ./ src/main/java/org/apache/tika/parser/journal/ src/main/resources/META-INF/services/ src/main/resources/org/apache/tika/parser/journal/ src/test/java/org/apache/tika/parser/journal/ src/test/resourc...

Author: mattmann
Date: Mon Aug 17 04:10:11 2015
New Revision: 1696191

URL: http://svn.apache.org/r1696191
Log:
TIKA-1699: refactored GROBID parser to use GROBID rest API. Only introduced 2 deps, CXF client, and also org.json. very small and works great. Thanks to Sujen Shah for his initial work on the GROBID patch.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf   (with props)
Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696191&r1=1696190&r2=1696191&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon Aug 17 04:10:11 2015
@@ -45,6 +45,7 @@
     <vorbis.version>0.6</vorbis.version>
     <pdfbox.version>1.8.10</pdfbox.version>
     <netcdf-java.version>4.5.5</netcdf-java.version>
+    <cxf.version>3.0.3</cxf.version>
   </properties>
 
   <dependencies>
@@ -231,6 +232,12 @@
       <artifactId>junrar</artifactId>
       <version>0.7</version>
     </dependency>
+	<dependency>
+      <groupId>org.apache.cxf</groupId>
+      <artifactId>cxf-rt-rs-client</artifactId>
+      <version>${cxf.version}</version>
+    </dependency>
+	
 
     <!-- Provided dependencies -->
     <dependency>
@@ -269,6 +276,13 @@
         </exclusion>
       </exclusions>
     </dependency>
+    
+    <dependency>
+      <groupId>org.json</groupId>
+      <artifactId>json</artifactId>
+      <version>20140107</version>
+    </dependency>
+    
 
     <!-- Test dependencies -->
     <dependency>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+
+public class GrobidRESTParser {
+
+  private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+  private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
+                                                               // doesn't work
+                                                               // nfc why
+
+  private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+  
+  private static String restHostUrlStr;
+  
+  public GrobidRESTParser(String restHostUrlStr){
+    if (restHostUrlStr == null){
+      GrobidRESTParser.restHostUrlStr = GROBID_REST_HOST;
+    }
+    else{
+      GrobidRESTParser.restHostUrlStr = restHostUrlStr;
+    }
+  }
+
+  public void parse(String filePath, ContentHandler handler, Metadata metadata,
+      ParseContext context) throws FileNotFoundException {
+
+    File pdfFile = new File(filePath);
+    ContentDisposition cd = new ContentDisposition(
+        "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+    Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
+    MultipartBody body = new MultipartBody(att);
+
+    Response response = WebClient
+        .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+        .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+        .post(body);
+
+    try {
+      String resp = response.readEntity(String.class);
+      Metadata teiMet = new TEIParser().parse(resp);
+      for(String key: teiMet.names()){
+        metadata.add("grobid:header_"+key, teiMet.get(key));
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  protected static boolean canRun() {
+    Response response = null;
+
+    try {
+      response = WebClient.create(restHostUrlStr + GROBID_ISALIVE_PATH)
+          .accept(MediaType.TEXT_HTML).get();
+      String resp = response.readEntity(String.class);
+      return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+    } catch (Exception e) {
+      e.printStackTrace();
+      return false;
+    }
+  }
+
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * PDF parser that first asks a GROBID REST service for journal header
+ * metadata and then delegates the regular content extraction to
+ * {@link PDFParser}.
+ */
+public class JournalParser extends AbstractParser {
+
+  /**
+   * Generated serial ID
+   */
+  private static final long serialVersionUID = 4664255544154296438L;
+
+  private static final MediaType TYPE = MediaType.application("pdf");
+
+  private static final Set<MediaType> SUPPORTED_TYPES = Collections
+      .singleton(TYPE);
+
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return SUPPORTED_TYPES;
+  }
+
+  /**
+   * Spools the stream to a file, sends that file to GROBID for header
+   * metadata and then runs {@link PDFParser} over the same file.
+   * <p>
+   * Temporary resources and the streams opened here are released before the
+   * method returns, whether or not parsing succeeds.
+   */
+  public void parse(InputStream stream, ContentHandler handler,
+      Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
+    TemporaryResources tmp = new TemporaryResources();
+    try {
+      TikaInputStream tis = TikaInputStream.get(stream, tmp);
+      File tmpFile = tis.getFile();
+
+      GrobidRESTParser grobidParser = new GrobidRESTParser(
+          loadGrobidServerUrl());
+      grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);
+
+      InputStream pdfStream = new FileInputStream(tmpFile);
+      try {
+        new PDFParser().parse(pdfStream, handler, metadata, context);
+      } finally {
+        pdfStream.close();
+      }
+    } finally {
+      // deletes any temporary file created while spooling the stream
+      tmp.dispose();
+    }
+  }
+
+  /**
+   * Reads {@code grobid.server.url} from GrobidExtractor.properties.
+   *
+   * @return the configured URL, or {@code null} when the resource or the
+   *         property is missing (GrobidRESTParser then uses its default)
+   */
+  private String loadGrobidServerUrl() throws IOException {
+    InputStream propertyStream = JournalParser.class
+        .getResourceAsStream("GrobidExtractor.properties");
+    if (propertyStream == null) {
+      return null;
+    }
+    try {
+      Properties grobidProperties = new Properties();
+      grobidProperties.load(propertyStream);
+      return grobidProperties.getProperty("grobid.server.url");
+    } finally {
+      propertyStream.close();
+    }
+  }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,893 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.json.XML;
+
+public class TEIParser {
+
+  /**
+   * Creates a parser. The class declares no fields, so there is nothing to
+   * configure.
+   */
+  public TEIParser() {
+  }
+
+  /**
+   * Converts a TEI XML document into a fresh {@link Metadata} object.
+   *
+   * @param source
+   *          the TEI XML text
+   * @return the metadata populated from {@code source}
+   */
+  public Metadata parse(String source) {
+    Metadata result = new Metadata();
+    createGrobidMetadata(source, XML.toJSONObject(source), result);
+    return result;
+  }
+
+  /**
+   * Walks the JSON form of a TEI document and fills {@code metadata} from
+   * the sections found under {@code TEI/teiHeader}, then records the static
+   * entries via {@code addStaticMet}.
+   * <p>
+   * A document without a {@code TEI} or {@code teiHeader} object (for
+   * example an error page) contributes no header fields instead of raising
+   * an exception, so the static entries are still recorded.
+   *
+   * @param source
+   *          the original TEI XML text
+   * @param obj
+   *          JSON conversion of {@code source}
+   * @param metadata
+   *          receives the extracted values
+   */
+  private void createGrobidMetadata(String source, JSONObject obj,
+      Metadata metadata) {
+    if (obj != null) {
+      JSONObject tei = obj.optJSONObject("TEI");
+      JSONObject teiHeader = tei == null ? null : tei
+          .optJSONObject("teiHeader");
+
+      if (teiHeader != null) {
+        // NOTE(review): in TEI the text element is normally a sibling of
+        // teiHeader rather than a child - confirm this lookup can match.
+        if (teiHeader.has("text")) {
+          parseText(teiHeader.getJSONObject("text"), metadata);
+        }
+
+        if (teiHeader.has("fileDesc")) {
+          parseFileDesc(teiHeader.getJSONObject("fileDesc"), metadata);
+        }
+
+        if (teiHeader.has("profileDesc")) {
+          parseProfileDesc(teiHeader.getJSONObject("profileDesc"), metadata);
+        }
+      }
+    }
+
+    addStaticMet(source, obj, metadata);
+  }
+
+  /**
+   * Records entries that do not depend on the document structure: the
+   * metadata class name, the JSON form of the document and the raw XML.
+   *
+   * @param source
+   *          the original TEI XML text
+   * @param obj
+   *          JSON conversion of {@code source}; when {@code null} the
+   *          {@code TEIJSONSource} entry is omitted
+   * @param metadata
+   *          receives the entries
+   */
+  private void addStaticMet(String source, JSONObject obj, Metadata metadata) {
+    metadata.add("Class", Metadata.class.getName());
+    // the caller tolerates a null JSON object, so this method must too
+    if (obj != null) {
+      metadata.add("TEIJSONSource", obj.toString());
+    }
+    metadata.add("TEIXMLSource", source);
+  }
+
+  private void parseText(JSONObject text, Metadata metadata) {
+    if (text.has("xml:lang")) {
+      metadata.add("Language", text.getString("xml:lang"));
+    }
+  }
+
+  /**
+   * Dispatches the {@code titleStmt} and {@code sourceDesc} children of the
+   * file description, in that order, to their parsers.
+   */
+  private void parseFileDesc(JSONObject fileDesc, Metadata metadata) {
+    boolean hasTitleStmt = fileDesc.has("titleStmt");
+    if (hasTitleStmt) {
+      JSONObject titleStmt = fileDesc.getJSONObject("titleStmt");
+      parseTitleStmt(titleStmt, metadata);
+    }
+
+    boolean hasSourceDesc = fileDesc.has("sourceDesc");
+    if (hasSourceDesc) {
+      JSONObject sourceDesc = fileDesc.getJSONObject("sourceDesc");
+      parseSourceDesc(sourceDesc, metadata);
+    }
+  }
+
+  /**
+   * Adds the text content of the title element, when both the element and
+   * its content are present, as the {@code Title} entry.
+   */
+  private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) {
+    if (!titleStmt.has("title")) {
+      return;
+    }
+
+    JSONObject titleElement = titleStmt.getJSONObject("title");
+    if (!titleElement.has("content")) {
+      return;
+    }
+
+    metadata.add("Title", titleElement.getString("content"));
+  }
+
+  /**
+   * Hands the {@code biblStruct} child of the source description, when
+   * present, to {@code parseBiblStruct}.
+   */
+  private void parseSourceDesc(JSONObject sourceDesc, Metadata metadata) {
+    if (!sourceDesc.has("biblStruct")) {
+      return;
+    }
+    JSONObject biblStruct = sourceDesc.getJSONObject("biblStruct");
+    parseBiblStruct(biblStruct, metadata);
+  }
+
+  /**
+   * Extracts the authors listed in the {@code analytic} section and records
+   * the derived {@code Address}, {@code Affiliation}, {@code Authors} and
+   * {@code FullAffiliations} entries.
+   * <p>
+   * The author element is a JSON object when the document has one author
+   * and a JSON array when it has several; both shapes produce metadata.
+   * When there is no {@code analytic} object an {@code Error} entry is
+   * added instead.
+   */
+  private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) {
+    if (!biblStruct.has("analytic")
+        || !(biblStruct.get("analytic") instanceof JSONObject)) {
+      metadata.add("Error", "Unable to parse: no analytic section in JSON");
+      return;
+    }
+
+    JSONObject analytic = biblStruct.getJSONObject("analytic");
+    if (!analytic.has("author")) {
+      return;
+    }
+
+    Object authorObj = analytic.get("author");
+    List<Author> authorList = new ArrayList<Author>();
+
+    if (authorObj instanceof JSONObject) {
+      parseAuthor((JSONObject) authorObj, authorList);
+    } else if (authorObj instanceof JSONArray) {
+      JSONArray authors = (JSONArray) authorObj;
+      for (int i = 0; i < authors.length(); i++) {
+        parseAuthor(authors.getJSONObject(i), authorList);
+      }
+    } else {
+      // neither an object nor an array: there is no author to report
+      return;
+    }
+
+    metadata.add("Address", getMetadataAddresses(authorList));
+    metadata.add("Affiliation", getMetadataAffiliations(authorList));
+    metadata.add("Authors", getMetadataAuthors(authorList));
+    metadata.add("FullAffiliations", getMetadataFullAffiliations(authorList));
+  }
+
+  /**
+   * Renders the distinct affiliations of all authors, in order of first
+   * appearance, as {@code [affiliation,affiliation,...]}.
+   *
+   * @param authorList
+   *          authors whose affiliations are collected
+   * @return the bracketed, comma-separated list; {@code []} when no author
+   *         has an affiliation
+   */
+  private String getMetadataFullAffiliations(List<Author> authorList) {
+    List<Affiliation> unique = new ArrayList<Affiliation>();
+    for (Author a : authorList) {
+      for (Affiliation af : a.getAffiliations()) {
+        if (!unique.contains(af)) {
+          unique.add(af);
+        }
+      }
+    }
+
+    StringBuilder metAffils = new StringBuilder("[");
+    for (int i = 0; i < unique.size(); i++) {
+      // separator goes between entries, so nothing has to be removed later
+      if (i > 0) {
+        metAffils.append(",");
+      }
+      metAffils.append(unique.get(i).toString());
+    }
+    metAffils.append("]");
+    return metAffils.toString();
+  }
+
+  /**
+   * Builds the author line: for every author the first, middle and last
+   * name (each passed through {@code printOrBlank}), followed by the
+   * comma-separated 1-based numbers of that author's affiliations and a
+   * trailing blank. Affiliation numbers follow the order in which distinct
+   * affiliations first appear across all authors.
+   */
+  private String getMetadataAuthors(List<Author> authorList) {
+    // distinct affiliations, numbered by first appearance
+    List<Affiliation> known = new ArrayList<Affiliation>();
+    for (Author author : authorList) {
+      for (Affiliation candidate : author.getAffiliations()) {
+        if (known.contains(candidate)) {
+          continue;
+        }
+        known.add(candidate);
+      }
+    }
+
+    StringBuilder line = new StringBuilder();
+    for (Author author : authorList) {
+      line.append(printOrBlank(author.getFirstName()));
+      line.append(printOrBlank(author.getMiddleName()));
+      line.append(printOrBlank(author.getSurName()));
+
+      boolean firstNumber = true;
+      for (int pos = 0; pos < known.size(); pos++) {
+        if (!author.getAffiliations().contains(known.get(pos))) {
+          continue;
+        }
+        if (!firstNumber) {
+          line.append(",");
+        }
+        line.append(pos + 1);
+        firstNumber = false;
+      }
+
+      line.append(" ");
+    }
+
+    return line.toString();
+  }
+
+  /**
+   * Builds the numbered affiliation line, e.g. "1 Jet Propulsion Laboratory
+   * California Institute of Technology; 2 Computer Science Department
+   * University of Southern California". Entries are the distinct
+   * affiliations in order of first appearance, joined with "; ".
+   */
+  private String getMetadataAffiliations(List<Author> authorList) {
+    List<Affiliation> distinct = new ArrayList<Affiliation>();
+    for (Author author : authorList) {
+      for (Affiliation candidate : author.getAffiliations()) {
+        if (distinct.contains(candidate)) {
+          continue;
+        }
+        distinct.add(candidate);
+      }
+    }
+
+    StringBuilder line = new StringBuilder();
+    for (int i = 0; i < distinct.size(); i++) {
+      if (i > 0) {
+        line.append("; ");
+      }
+      String entry = (i + 1) + " " + distinct.get(i).getOrgName().toString();
+      // the final character of each entry is dropped (the organisation name
+      // is rendered with a trailing blank)
+      line.append(entry, 0, entry.length() - 1);
+    }
+
+    return line.toString();
+  }
+
+  /**
+   * Builds the address line: the distinct addresses of all affiliations, in
+   * order of first appearance, each followed by a blank, e.g. "Pasadena, CA
+   * 91109 USA Los Angeles, CA 90089 USA ".
+   */
+  private String getMetadataAddresses(List<Author> authorList) {
+    List<Address> seen = new ArrayList<Address>();
+    for (Author author : authorList) {
+      for (Affiliation affiliation : author.getAffiliations()) {
+        if (seen.contains(affiliation.getAddress())) {
+          continue;
+        }
+        seen.add(affiliation.getAddress());
+      }
+    }
+
+    StringBuilder line = new StringBuilder();
+    for (int i = 0; i < seen.size(); i++) {
+      line.append(seen.get(i).toString()).append(" ");
+    }
+    return line.toString();
+  }
+
+  /**
+   * Builds an {@code Author} from one author element and appends it to
+   * {@code authorList}. Name parts, surname and affiliations are only read
+   * when the element has a {@code persName} child; otherwise an empty
+   * author is appended.
+   */
+  private void parseAuthor(JSONObject authorObj, List<Author> authorList) {
+    Author parsed = new Author();
+
+    if (authorObj.has("persName")) {
+      JSONObject persName = authorObj.getJSONObject("persName");
+
+      // forename is an object for a single part, an array for several
+      Object forename = persName.has("forename") ? persName.get("forename")
+          : null;
+      if (forename instanceof JSONObject) {
+        parseNamePart((JSONObject) forename, parsed);
+      } else if (forename instanceof JSONArray) {
+        JSONArray parts = (JSONArray) forename;
+        for (int idx = 0; idx < parts.length(); idx++) {
+          parseNamePart(parts.getJSONObject(idx), parsed);
+        }
+      }
+
+      if (persName.has("surname")) {
+        parsed.setSurName(persName.getString("surname"));
+      }
+
+      // NOTE(review): affiliations are skipped for authors without a
+      // persName - confirm that this is intended.
+      if (authorObj.has("affiliation")) {
+        parseAffiliation(authorObj.get("affiliation"), parsed);
+      }
+    }
+
+    authorList.add(parsed);
+  }
+
+  /**
+   * Stores one forename part on the author: type {@code first} sets the
+   * first name, type {@code middle} the middle name. Parts lacking a type
+   * or content, and parts of any other type, are ignored.
+   */
+  private void parseNamePart(JSONObject namePart, Author author) {
+    boolean complete = namePart.has("type") && namePart.has("content");
+    if (!complete) {
+      return;
+    }
+
+    String kind = namePart.getString("type");
+    String value = namePart.getString("content");
+
+    if ("first".equals(kind)) {
+      author.setFirstName(value);
+    } else if ("middle".equals(kind)) {
+      author.setMiddleName(value);
+    }
+  }
+
+  /**
+   * Parses the affiliation value of an author, which is a JSON object for
+   * a single affiliation and a JSON array for several. Any other value is
+   * ignored.
+   */
+  private void parseAffiliation(Object affiliationJSON, Author author) {
+    if (affiliationJSON instanceof JSONArray) {
+      JSONArray entries = (JSONArray) affiliationJSON;
+      int total = entries.length();
+      for (int idx = 0; idx < total; idx++) {
+        parseOneAffiliation(entries.getJSONObject(idx), author);
+      }
+      return;
+    }
+
+    if (affiliationJSON instanceof JSONObject) {
+      parseOneAffiliation((JSONObject) affiliationJSON, author);
+    }
+  }
+
+  /**
+   * Builds an {@code Affiliation} from one affiliation element and adds it
+   * to the author.
+   * <p>
+   * The {@code orgName} value is a JSON object for a single organisation
+   * name and a JSON array for several; in both cases the parsed name is
+   * stored on the affiliation.
+   */
+  private void parseOneAffiliation(JSONObject affiliationObj, Author author) {
+
+    Affiliation affiliation = new Affiliation();
+    if (affiliationObj.has("address")) {
+      parseAddress(affiliationObj.getJSONObject("address"), affiliation);
+    }
+
+    if (affiliationObj.has("orgName")) {
+      OrgName orgName = new OrgName();
+      Object orgObject = affiliationObj.get("orgName");
+      if (orgObject instanceof JSONObject) {
+        parseOrgName((JSONObject) orgObject, orgName);
+      } else if (orgObject instanceof JSONArray) {
+        JSONArray orgNames = (JSONArray) orgObject;
+        for (int i = 0; i < orgNames.length(); i++) {
+          parseOrgName(orgNames.getJSONObject(i), orgName);
+        }
+      }
+
+      // set for both shapes, not only for the array form
+      affiliation.setOrgName(orgName);
+    }
+
+    author.getAffiliations().add(affiliation);
+  }
+
+  /**
+   * Builds an {@code Address} from an address element and stores it on the
+   * affiliation.
+   * <p>
+   * Values are read with {@code get} and converted with
+   * {@code String.valueOf} because the XML-to-JSON conversion turns
+   * numeric-looking text (typically post codes) into numbers. Text values
+   * are stored as they are, without JSON quoting.
+   * <p>
+   * The country is either a JSON object with optional {@code content} and
+   * {@code key} members or a plain string, which is used as the content.
+   */
+  private void parseAddress(JSONObject addressObj, Affiliation affiliation) {
+    Address address = new Address();
+
+    if (addressObj.has("region")) {
+      address.setRegion(String.valueOf(addressObj.get("region")));
+    }
+
+    if (addressObj.has("postCode")) {
+      address.setPostCode(String.valueOf(addressObj.get("postCode")));
+    }
+
+    if (addressObj.has("settlement")) {
+      address.setSettlment(String.valueOf(addressObj.get("settlement")));
+    }
+
+    if (addressObj.has("country")) {
+      Country country = new Country();
+      Object countryObj = addressObj.get("country");
+
+      if (countryObj instanceof JSONObject) {
+        JSONObject countryJson = (JSONObject) countryObj;
+
+        if (countryJson.has("content")) {
+          country.setContent(String.valueOf(countryJson.get("content")));
+        }
+
+        if (countryJson.has("key")) {
+          country.setKey(String.valueOf(countryJson.get("key")));
+        }
+      } else if (countryObj instanceof String) {
+        country.setContent((String) countryObj);
+      }
+      address.setCountry(country);
+    }
+
+    affiliation.setAddress(address);
+  }
+
+  /**
+   * Appends one typed name part, read from the {@code content} and
+   * {@code type} members of {@code orgObj}, to the organisation name. A
+   * part is appended even when both members are absent.
+   */
+  private void parseOrgName(JSONObject orgObj, OrgName orgName) {
+    OrgTypeName part = new OrgTypeName();
+
+    boolean hasContent = orgObj.has("content");
+    if (hasContent) {
+      String text = orgObj.getString("content");
+      part.setName(text);
+    }
+
+    boolean hasType = orgObj.has("type");
+    if (hasType) {
+      String kind = orgObj.getString("type");
+      part.setType(kind);
+    }
+
+    List<OrgTypeName> parts = orgName.getTypeNames();
+    parts.add(part);
+  }
+
+  /**
+   * Extracts the {@code Abstract} and {@code Keyword} entries from the
+   * profile description.
+   * <p>
+   * The abstract text is taken from the {@code p} child of the
+   * {@code abstract} element, or from the element itself when it is plain
+   * text; the former lookup of {@code p} directly on the profile
+   * description is kept as a fallback. Keywords may be a plain string or an
+   * object whose {@code term} member is a single value or an array; every
+   * term becomes one {@code Keyword} value, stored without JSON quoting.
+   */
+  private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) {
+    if (profileDesc.has("abstract")) {
+      Object abstractObj = profileDesc.get("abstract");
+      if (abstractObj instanceof JSONObject
+          && ((JSONObject) abstractObj).has("p")) {
+        metadata.add("Abstract",
+            String.valueOf(((JSONObject) abstractObj).get("p")));
+      } else if (abstractObj instanceof String
+          && ((String) abstractObj).length() > 0) {
+        metadata.add("Abstract", (String) abstractObj);
+      } else if (profileDesc.has("p")) {
+        metadata.add("Abstract", profileDesc.getString("p"));
+      }
+    }
+
+    if (profileDesc.has("textClass")) {
+      JSONObject textClass = profileDesc.getJSONObject("textClass");
+
+      if (textClass.has("keywords")) {
+        Object keywordsObj = textClass.get("keywords");
+        // test AJ15.pdf
+        if (keywordsObj instanceof String) {
+          metadata.add("Keyword", (String) keywordsObj);
+        } else if (keywordsObj instanceof JSONObject) {
+          JSONObject keywords = (JSONObject) keywordsObj;
+          if (keywords.has("term")) {
+            Object termObj = keywords.get("term");
+            if (termObj instanceof JSONArray) {
+              JSONArray termArr = (JSONArray) termObj;
+              for (int i = 0; i < termArr.length(); i++) {
+                metadata.add("Keyword", String.valueOf(termArr.get(i)));
+              }
+            } else {
+              // a single term is not wrapped in an array
+              metadata.add("Keyword", String.valueOf(termObj));
+            }
+          }
+        }
+
+      }
+    }
+
+  }
+
+  /**
+   * Returns the value followed by a blank, or a single blank when the value
+   * is {@code null} or empty.
+   */
+  private String printOrBlank(String val) {
+    boolean missing = val == null || val.equals("");
+    return missing ? " " : val + " ";
+  }
+
+  /**
+   * An author: name parts plus the list of affiliations, which starts out
+   * empty.
+   */
+  class Author {
+
+    private String surName;
+
+    private String middleName;
+
+    private String firstName;
+
+    private List<Affiliation> affiliations;
+
+    public Author() {
+      this.surName = null;
+      this.middleName = null;
+      this.firstName = null;
+      this.affiliations = new ArrayList<Affiliation>();
+    }
+
+    /**
+     * @return the surName
+     */
+    public String getSurName() {
+      return surName;
+    }
+
+    /**
+     * @param surName
+     *          the surName to set
+     */
+    public void setSurName(String surName) {
+      this.surName = surName;
+    }
+
+    /**
+     * @return the middleName
+     */
+    public String getMiddleName() {
+      return middleName;
+    }
+
+    /**
+     * @param middleName
+     *          the middleName to set
+     */
+    public void setMiddleName(String middleName) {
+      this.middleName = middleName;
+    }
+
+    /**
+     * @return the firstName
+     */
+    public String getFirstName() {
+      return firstName;
+    }
+
+    /**
+     * @param firstName
+     *          the firstName to set
+     */
+    public void setFirstName(String firstName) {
+      this.firstName = firstName;
+    }
+
+    /**
+     * @return the affiliations
+     */
+    public List<Affiliation> getAffiliations() {
+      return affiliations;
+    }
+
+    /**
+     * @param affiliations
+     *          the affiliations to set
+     */
+    public void setAffiliations(List<Affiliation> affiliations) {
+      this.affiliations = affiliations;
+    }
+
+    /**
+     * Lists all fields; a missing middle name is shown as an empty string.
+     *
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+      // the conditional must be parenthesised: '+' binds tighter than '!='
+      // and '?:', which previously reduced the result to the middle name
+      return "Author [surName=" + surName + ", middleName="
+          + (middleName != null ? middleName : "") + ", firstName="
+          + firstName + ", affiliations=" + affiliations + "]";
+    }
+
+  }
+
+  class Affiliation {
+
+    private OrgName orgName;
+
+    private Address address;
+
+    public Affiliation() {
+      this.orgName = new OrgName();
+      this.address = new Address();
+    }
+
+    /**
+     * @return the orgName
+     */
+    public OrgName getOrgName() {
+      return orgName;
+    }
+
+    /**
+     * @param orgName
+     *          the orgName to set
+     */
+    public void setOrgName(OrgName orgName) {
+      this.orgName = orgName;
+    }
+
+    /**
+     * @return the address
+     */
+    public Address getAddress() {
+      return address;
+    }
+
+    /**
+     * @param address
+     *          the address to set
+     */
+    public void setAddress(Address address) {
+      this.address = address;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      Affiliation otherA = (Affiliation) obj;
+      return this.getAddress().equals(otherA.getAddress())
+          && this.getOrgName().equals(otherA.getOrgName());
+
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+      return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+    }
+
+  }
+
+  class OrgName {
+    private List<OrgTypeName> typeNames;
+
+    public OrgName() {
+      this.typeNames = new ArrayList<OrgTypeName>();
+    }
+
+    /**
+     * @return the typeNames
+     */
+    public List<OrgTypeName> getTypeNames() {
+      return typeNames;
+    }
+
+    /**
+     * @param typeNames
+     *          the typeNames to set
+     */
+    public void setTypeNames(List<OrgTypeName> typeNames) {
+      this.typeNames = typeNames;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#toString()
+     */
+
+    @Override
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      for (OrgTypeName on : this.typeNames) {
+        builder.append(on.getName());
+        builder.append(" ");
+      }
+      return builder.toString();
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      OrgName otherA = (OrgName) obj;
+
+      if (otherA.getTypeNames() != null) {
+        if (this.typeNames == null) {
+          return false;
+        } else {
+          return this.typeNames.size() == otherA.getTypeNames().size();
+        }
+      } else {
+        if (this.typeNames == null) {
+          return true;
+        } else
+          return false;
+      }
+
+    }
+
+  }
+
+  class OrgTypeName {
+    private String name;
+    private String type;
+
+    public OrgTypeName() {
+      this.name = null;
+      this.type = null;
+    }
+
+    /**
+     * @return the name
+     */
+    public String getName() {
+      return name;
+    }
+
+    /**
+     * @param name
+     *          the name to set
+     */
+    public void setName(String name) {
+      this.name = name;
+    }
+
+    /**
+     * @return the type
+     */
+    public String getType() {
+      return type;
+    }
+
+    /**
+     * @param type
+     *          the type to set
+     */
+    public void setType(String type) {
+      this.type = type;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      OrgTypeName otherOrgName = (OrgTypeName) obj;
+      return this.type.equals(otherOrgName.getType())
+          && this.name.equals(otherOrgName.getName());
+    }
+
+  }
+
+  private class Address {
+
+    private String region;
+    private String postCode;
+    private String settlment;
+    private Country country;
+
+    public Address() {
+      this.region = null;
+      this.postCode = null;
+      this.settlment = null;
+      this.country = new Country();
+    }
+
+    /**
+     * @return the region
+     */
+    public String getRegion() {
+      return region;
+    }
+
+    /**
+     * @param region
+     *          the region to set
+     */
+    public void setRegion(String region) {
+      this.region = region;
+    }
+
+    /**
+     * @return the postCode
+     */
+    public String getPostCode() {
+      return postCode;
+    }
+
+    /**
+     * @param postCode
+     *          the postCode to set
+     */
+    public void setPostCode(String postCode) {
+      this.postCode = postCode;
+    }
+
+    /**
+     * @return the settlment
+     */
+    public String getSettlment() {
+      return settlment;
+    }
+
+    /**
+     * @param settlment
+     *          the settlment to set
+     */
+    public void setSettlment(String settlment) {
+      this.settlment = settlment;
+    }
+
+    /**
+     * @return the country
+     */
+    public Country getCountry() {
+      return country;
+    }
+
+    /**
+     * @param country
+     *          the country to set
+     */
+    public void setCountry(Country country) {
+      this.country = country;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      Address otherA = (Address) obj;
+      if (this.settlment == null) {
+        return otherA.getSettlment() == null;
+      } else if (this.country == null) {
+        return otherA.getCountry() == null;
+      } else if (this.postCode == null) {
+        return otherA.getPostCode() == null;
+      } else if (this.region == null) {
+        return otherA.getRegion() == null;
+      }
+
+      return this.settlment.equals(otherA.getSettlment())
+          && this.country.equals(otherA.getCountry())
+          && this.postCode.equals(otherA.getPostCode())
+          && this.region.equals(otherA.getRegion());
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      builder.append(settlment);
+      builder.append(", ");
+      builder.append(region);
+      builder.append(" ");
+      builder.append(postCode);
+      builder.append(" ");
+      builder.append(country.getContent());
+      return builder.toString();
+    }
+  }
+
+  private class Country {
+    private String key;
+    private String content;
+
+    public Country() {
+      this.key = null;
+      this.content = null;
+    }
+
+    /**
+     * @return the key
+     */
+    public String getKey() {
+      return key;
+    }
+
+    /**
+     * @param key
+     *          the key to set
+     */
+    public void setKey(String key) {
+      this.key = key;
+    }
+
+    /**
+     * @return the content
+     */
+    public String getContent() {
+      return content;
+    }
+
+    /**
+     * @param content
+     *          the content to set
+     */
+    public void setContent(String content) {
+      this.content = content;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      Country otherC = (Country) obj;
+
+      if (this.key == null) {
+        if (otherC.getKey() != null) {
+          return false;
+        } else {
+          if (this.content == null) {
+            if (otherC.getContent() != null) {
+              return false;
+            } else {
+              return true;
+            }
+          } else {
+            return content.equals(otherC.getContent());
+          }
+        }
+      } else {
+        if (this.content == null) {
+          if (otherC.getContent() != null) {
+            return false;
+          } else {
+            return this.key.equals(otherC.getKey());
+          }
+        } else {
+          return this.key.equals(otherC.getKey())
+              && this.content.equals(otherC.getContent());
+        }
+      }
+    }
+
+  }
+}

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1696191&r1=1696190&r2=1696191&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Mon Aug 17 04:10:11 2015
@@ -66,3 +66,4 @@ org.apache.tika.parser.isatab.ISArchiveP
 org.apache.tika.parser.geoinfo.GeographicInformationParser
 org.apache.tika.parser.geo.topic.GeoParser
 org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.journal.JournalParser
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties (added)
+++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties Mon Aug 17 04:10:11 2015
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+grobid.server.url=http://localhost:8080

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.apache.tika.parser.journal.GrobidRESTParser.canRun;
+import java.io.InputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/** Checks that {@link JournalParser} populates the GROBID header title. */
+public class JournalParserTest {
+  @Test
+  public void testJournalParser() {
+    // Must name the PDF fixture that ships under test-documents.
+    String path = "/test-documents/testJournalParser.pdf";
+    ContentHandler handler = new BodyContentHandler();
+    Metadata metadata = new Metadata();
+
+    // Skip (rather than fail) when GrobidRESTParser.canRun() is false.
+    assumeTrue(canRun());
+    InputStream stream = JournalParserTest.class.getResourceAsStream(path);
+    assertNotNull("missing test resource " + path, stream);
+    try {
+      new JournalParser().parse(stream, handler, metadata, new ParseContext());
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+    assertNotNull(metadata.get("grobid:header_Title"));
+  }
+}

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf?rev=1696191&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream