You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [4/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-mo...

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,109 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Blob;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Locale;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Concrete table reader for SQLite3 databases.  This overrides the
+ * column type handling inherited from JDBCTableReader.
+ * <p/>
+ * This class is not designed to be thread safe (because of DateFormat)!
+ * Create a new instance for each parse, as AbstractDBParser does.
+ * <p/>
+ * For now, this silently skips cells of type CLOB, because xerial's jdbc connector
+ * does not currently support them.
+ */
+class SQLite3TableReader extends JDBCTableReader {
+
+
+    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT); //day precision only; SimpleDateFormat is not thread safe
+
+    public SQLite3TableReader(Connection connection, String tableName, ParseContext context) {
+        super(connection, tableName, context);
+    }
+
+
+    /**
+     * No-op for now in {@link SQLite3TableReader}: the CLOB cell is skipped and
+     * nothing is read from the result set or written to the handler.
+     * @param tableName   ignored by this implementation
+     * @param fieldName   ignored by this implementation
+     * @param rowNum      ignored by this implementation
+     * @param resultSet   ignored by this implementation
+     * @param columnIndex ignored by this implementation
+     * @param handler     ignored by this implementation
+     * @param context     ignored by this implementation
+     * @throws java.sql.SQLException    declared for the override; never thrown here
+     * @throws java.io.IOException      declared for the override; never thrown here
+     * @throws org.xml.sax.SAXException declared for the override; never thrown here
+     */
+    @Override
+    protected void handleClob(String tableName, String fieldName, int rowNum,
+                              ResultSet resultSet, int columnIndex,
+                              ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        //no-op for now.
+    }
+
+    /**
+     * The jdbc connection to SQLite does not yet implement Blob, so the bytes are read
+     * with getBytes(); the blob argument is not used and m is handed to TikaInputStream.
+     * @param resultSet   resultSet to read the bytes from
+     * @param columnIndex columnIndex for blob
+     * @return a TikaInputStream over the bytes of resultSet.getBytes(columnIndex)
+     * @throws java.sql.SQLException propagated from resultSet.getBytes()
+     */
+    @Override
+    protected InputStream getInputStreamFromBlob(ResultSet resultSet, int columnIndex, Blob blob, Metadata m) throws SQLException {
+        return TikaInputStream.get(resultSet.getBytes(columnIndex), m);
+    }
+
+    @Override
+    protected void handleInteger(String columnTypeName, ResultSet rs, int columnIndex,
+                                 ContentHandler handler) throws SQLException, SAXException {
+        //As of this writing, with xerial's sqlite jdbc connector, a timestamp is
+        //stored as a column of type Integer, but the columnTypeName is TIMESTAMP, and the
+        //value is a string representing a Long.
+        if (columnTypeName.equals("TIMESTAMP")) {
+            addAllCharacters(parseDateFromLongString(rs.getString(columnIndex)), handler); //written as yyyy-MM-dd; time of day is dropped
+        } else {
+            addAllCharacters(Integer.toString(rs.getInt(columnIndex)), handler); //NOTE(review): getInt() is 32-bit; wider INTEGER values may not round-trip -- confirm
+        }
+
+    }
+
+    private String parseDateFromLongString(String longString) throws SAXException {
+        java.sql.Date d = new java.sql.Date(Long.parseLong(longString)); //epoch millis; NOTE(review): NumberFormatException on null or non-numeric input
+        return dateFormat.format(d);
+
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+#org.apache.tika.parser.jdbc.SQLite3DBParser
+org.apache.tika.parser.jdbc.SQLite3Parser

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,356 @@
+package org.apache.tika.parser.jdbc;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class SQLite3ParserTest extends TikaTest {
+    private final static String TEST_FILE_NAME = "testSqlite3b.db";
+    private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME;
+
+    @Test
+    public void testBasic() throws Exception {
+        Parser p = new AutoDetectParser();
+
+        //test different types of input streams
+        //actual inputstream, memory buffered bytearray and literal file
+        InputStream[] streams = new InputStream[3];
+        streams[0] = getResourceAsStream(TEST_FILE1);
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        IOUtils.copy(getResourceAsStream(TEST_FILE1), bos);
+        streams[1] = new ByteArrayInputStream(bos.toByteArray());
+        streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1));
+        int tests = 0;
+        for (InputStream stream : streams) {
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+            //1) getXML closes the stream
+            //2) getXML runs recursively on the contents, so the embedded docs should show up
+            XMLResult result = getXML(stream, p, metadata);
+            String x = result.xml;
+            //first table name
+            assertContains("<table name=\"my_table1\"><thead><tr>\t<th>INT_COL</th>", x);
+            //non-ascii
+            assertContains("<td>普林斯顿大学</td>", x);
+            //boolean
+            assertContains("<td>true</td>\t<td>2015-01-02</td>", x);
+            //date test
+            assertContains("2015-01-04", x);
+            //timestamp test
+            assertContains("2015-01-03 15:17:03", x);
+            //first embedded doc's image tag
+            assertContains("alt=\"image1.png\"", x);
+            //second embedded doc's image tag
+            assertContains("alt=\"A description...\"", x);
+            //second table name
+            assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", x);
+
+            Metadata post = result.metadata;
+            String[] tableNames = post.getValues(Database.TABLE_NAME);
+            assertEquals(2, tableNames.length);
+            assertEquals("my_table1", tableNames[0]);
+            assertEquals("my_table2", tableNames[1]);
+            tests++;
+        }
+        assertEquals(3, tests);
+    }
+
+    //Make sure that table cells and rows are properly marked so that the plain-text
+    //body output has \t between cells and \n at the end of a row.
+    @Test
+    public void testSpacesInBodyContentHandler() throws Exception {
+        Parser p = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+        ContentHandler handler = new BodyContentHandler(-1); //-1: no write limit
+        ParseContext ctx = new ParseContext();
+        ctx.set(Parser.class, p); //parser made available via the context for embedded documents
+        try (InputStream stream = getResourceAsStream(TEST_FILE1)) {
+            p.parse(stream, handler, metadata, ctx);
+        }
+        String s = handler.toString();
+        assertContains("0\t2.3\t2.4\tlorem", s);
+        assertContains("tempor\n", s);
+    }
+
+    //Test what happens if the user forgets to pass in a parser via the context
+    //to handle embedded documents: only their headers should appear in the output.
+    @Test
+    public void testNotAddingEmbeddedParserToParseContext() throws Exception {
+        Parser p = new AutoDetectParser();
+
+        InputStream is = getResourceAsStream(TEST_FILE1); //NOTE(review): this stream is never closed in this test
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+        ContentHandler handler = new ToXMLContentHandler();
+        p.parse(is, handler, metadata, new ParseContext()); //empty context: no Parser registered for embedded docs
+        String xml = handler.toString();
+        //just includes headers for embedded documents
+        assertContains("<table name=\"my_table1\"><thead><tr>", xml);
+        assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml);
+        //but no other content
+        assertNotContained("dog", xml);
+        assertNotContained("alt=\"image1.png\"", xml);
+        //second embedded doc's image tag
+        assertNotContained("alt=\"A description...\"", xml);
+    }
+
+    @Test
+    public void testRecursiveParserWrapper() throws Exception {
+        Parser p = new AutoDetectParser();
+
+        RecursiveParserWrapper wrapper =
+                new RecursiveParserWrapper(p, new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
+        InputStream is = getResourceAsStream(TEST_FILE1);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+        wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
+        List<Metadata> metadataList = wrapper.getMetadata();
+        int i = 0;
+        assertEquals(5, metadataList.size());
+        //make sure the \t are inserted in a body handler
+
+        String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains("0\t2.3\t2.4\tlorem", table);
+        assertContains("普林斯顿大学", table);
+
+        //make sure the \n is inserted
+        String table2 = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains("do eiusmod tempor\n", table2);
+
+        assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+        //confirm .doc was added to blob
+        assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+    }
+
+    @Test
+    public void testParserContainerExtractor() throws Exception {
+        //There should be 4 embedded documents, expected in the order doc, png, docx, png
+        //(the assertions below do not expect the tables themselves in the list):
+        //2x word files, one doc and one docx
+        //2x png files, the same image embedded in each of the doc and docx
+
+        ParserContainerExtractor ex = new ParserContainerExtractor();
+        ByteCopyingHandler byteCopier = new ByteCopyingHandler();
+        InputStream is = getResourceAsStream(TEST_FILE1);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); //NOTE(review): metadata is never passed to extract()
+        ex.extract(TikaInputStream.get(is), ex, byteCopier);
+
+        assertEquals(4, byteCopier.bytes.size());
+        String[] strings = new String[4];
+        for (int i = 1; i < byteCopier.bytes.size(); i++) { //index 0 is the OLE .doc, compared byte-by-byte below
+            byte[] byteArr = byteCopier.bytes.get(i);
+            String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8); //first 1000 bytes at most are enough for the magic strings
+            strings[i] = s;
+        }
+        byte[] oleBytes = new byte[]{ //D0 CF 11 E0 A1 B1 1A E1 00 00 as signed bytes
+                (byte) -48,
+                (byte) -49,
+                (byte) 17,
+                (byte) -32,
+                (byte) -95,
+                (byte) -79,
+                (byte) 26,
+                (byte) -31,
+                (byte) 0,
+                (byte) 0,
+        };
+        //test OLE
+        for (int i = 0; i < 10; i++) {
+            assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]);
+        }
+        assertContains("PNG", strings[1]);
+        assertContains("PK", strings[2]);
+        assertContains("PNG", strings[3]);
+    }
+
+    //This confirms that a handler which reads each embedded stream twice
+    //(via mark/reset) is not quadrupling the number of attachments.
+    @Test
+    public void testInputStreamReset() throws Exception {
+        //There should be 8 recorded byte arrays, because the handler stores every stream twice:
+        //4x word files, two docs and two docxs
+        //4x png files, the same image embedded in each of the doc and docx
+
+        ParserContainerExtractor ex = new ParserContainerExtractor();
+        InputStreamResettingHandler byteCopier = new InputStreamResettingHandler();
+        InputStream is = getResourceAsStream(TEST_FILE1);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); //NOTE(review): metadata is never passed to extract()
+        ex.extract(TikaInputStream.get(is), ex, byteCopier);
+        is.reset(); //NOTE(review): resets the raw resource stream, not the TikaInputStream wrapper -- intent unclear
+        assertEquals(8, byteCopier.bytes.size());
+    }
+
+
+    public static class InputStreamResettingHandler implements EmbeddedResourceHandler { //records every embedded stream twice, using mark/reset
+
+        public List<byte[]> bytes = new ArrayList<byte[]>(); //two entries per handled stream
+
+        @Override
+        public void handle(String filename, MediaType mediaType,
+                           InputStream stream) {
+            ByteArrayOutputStream os = new ByteArrayOutputStream();
+            if (!stream.markSupported()) {
+                stream = TikaInputStream.get(stream); //wrap so that mark/reset can be used
+            }
+            stream.mark(1000000); //read limit of 1,000,000 bytes for the reset() calls below
+            try {
+                IOUtils.copy(stream, os);
+                bytes.add(os.toByteArray());
+                stream.reset();
+                //now read the same stream a second time and record it again
+                os.reset();
+                IOUtils.copy(stream, os);
+                bytes.add(os.toByteArray());
+                stream.reset();
+            } catch (IOException e) {
+                //swallow -- NOTE(review): a failure here just leaves fewer entries in bytes
+            }
+        }
+    }
+
+    //code used for creating the test file
+/*
+    private Connection getConnection(String dbFileName) throws Exception {
+        File testDirectory = new File(this.getClass().getResource("/test-documents").toURI());
+        System.out.println("Writing to: " + testDirectory.getAbsolutePath());
+        File testDB = new File(testDirectory, dbFileName);
+        Connection c = null;
+        try {
+            Class.forName("org.sqlite.JDBC");
+            c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath());
+        } catch ( Exception e ) {
+            System.err.println( e.getClass().getName() + ": " + e.getMessage() );
+            System.exit(0);
+        }
+        return c;
+    }
+
+    @Test
+    public void testCreateDB() throws Exception {
+        Connection c = getConnection("testSQLLite3b.db");
+        Statement st = c.createStatement();
+        String sql = "DROP TABLE if exists my_table1";
+        st.execute(sql);
+        sql = "CREATE TABLE my_table1 (" +
+                "INT_COL INT PRIMARY KEY, "+
+                "FLOAT_COL FLOAT, " +
+                "DOUBLE_COL DOUBLE, " +
+                "CHAR_COL CHAR(30), "+
+                "VARCHAR_COL VARCHAR(30), "+
+                "BOOLEAN_COL BOOLEAN,"+
+                "DATE_COL DATE,"+
+                "TIME_STAMP_COL TIMESTAMP,"+
+                "BYTES_COL BYTES" +
+        ")";
+        st.execute(sql);
+        sql = "insert into my_table1 (INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " +
+                "VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, BYTES_COL) " +
+                "values (?,?,?,?,?,?,?,?,?)";
+        SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+        java.util.Date d = f.parse("2015-01-03 15:17:03");
+        System.out.println(d.getTime());
+        long d1Long = 1420229823000L;// 2015-01-02 15:17:03
+        long d2Long = 1420316223000L;// 2015-01-03 15:17:03
+        PreparedStatement ps = c.prepareStatement(sql);
+        ps.setInt(1, 0);
+        ps.setFloat(2, 2.3f);
+        ps.setDouble(3, 2.4d);
+        ps.setString(4, "lorem");
+        ps.setString(5, "普林斯顿大学");
+        ps.setBoolean(6, true);
+        ps.setString(7, "2015-01-02");
+        ps.setString(8, "2015-01-03 15:17:03");
+//        ps.setClob(9, new StringReader(clobString));
+        ps.setBytes(9, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox"
+        ps.executeUpdate();
+        ps.clearParameters();
+
+        ps.setInt(1, 1);
+        ps.setFloat(2, 4.6f);
+        ps.setDouble(3, 4.8d);
+        ps.setString(4, "dolor");
+        ps.setString(5, "sit");
+        ps.setBoolean(6, false);
+        ps.setString(7, "2015-01-04");
+        ps.setString(8, "2015-01-03 15:17:03");
+        //ps.setClob(9, new StringReader("consectetur adipiscing elit"));
+        ps.setBytes(9, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The end!"
+
+        ps.executeUpdate();
+
+        //build table2
+        sql = "DROP TABLE if exists my_table2";
+        st.execute(sql);
+
+        sql = "CREATE TABLE my_table2 (" +
+                "INT_COL2 INT PRIMARY KEY, "+
+                "VARCHAR_COL2 VARCHAR(64))";
+        st.execute(sql);
+        sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')";
+        st.execute(sql);
+        sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')";
+        st.execute(sql);
+
+        c.close();
+    }
+
+    private byte[] getByteArray(InputStream is) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        byte[] buff = new byte[1024];
+        for (int bytesRead; (bytesRead = is.read(buff)) != -1;) {
+            bos.write(buff, 0, bytesRead);
+        }
+        return bos.toByteArray();
+    }
+
+*/
+
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-ebook-parser-module</artifactId>
+  <name>Apache Tika e-Book Parser Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-parser-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import javax.xml.XMLConstants;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+
+/**
+ * Parser for EPUB OPS <code>*.html</code> files.
+ * Declares no supported media types (it is not a top-level parser).
+ * For the time being, assume XHTML (TODO: DTBook)
+ */
+public class EpubContentParser extends AbstractParser {
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet(); // not a top-level parser
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml =
+            new XHTMLContentHandler(handler,metadata);
+
+        try {
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            factory.setValidating(false);
+            factory.setNamespaceAware(true);
+            try {
+                factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            } catch (SAXNotRecognizedException e) {
+                // TIKA-329: Some XML parsers do not support the secure-processing
+                // feature, even though it's required by JAXP in Java 5. Ignoring
+                // the exception is fine here, deployments without this feature
+                // are inherently vulnerable to XML denial-of-service attacks.
+            }
+            SAXParser parser = factory.newSAXParser();
+            parser.parse(
+                    new CloseShieldInputStream(stream), // shield: the SAX parser cannot close the caller's stream
+                    new OfflineContentHandler(xhtml)); // SAX events are forwarded to the XHTML handler
+        } catch (ParserConfigurationException e) {
+            throw new TikaException("XML parser configuration error", e); // other SAX/IO failures propagate unchanged
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.xml.DcXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Epub parser
+ */
+/**
+ * Epub parser.
+ * <p>
+ * Walks the entries of the ZIP container in order: the {@code mimetype}
+ * entry supplies the content type, {@code metadata.xml} and {@code *.opf}
+ * entries are handed to the metadata parser, and {@code *.html} /
+ * {@code *.xhtml} entries are handed to the content parser.
+ */
+public class EpubParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 215176772484050550L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("epub+zip"),
+                    MediaType.application("x-ibooks+zip")
+            )));
+
+    /** Parser used for the metadata.xml and *.opf entries. */
+    private Parser meta = new DcXMLParser();
+
+    /** Parser used for the *.html and *.xhtml entries. */
+    private Parser content = new EpubContentParser();
+
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    public Parser getContentParser() {
+        return content;
+    }
+
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts metadata and text from an EPUB / iBooks container.
+     *
+     * @param stream   the ZIP container; read through a {@link ZipInputStream}
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata metadata object that is filled in while parsing
+     * @param context  parse context handed on to the delegate parsers
+     * @throws IOException   if the container cannot be read
+     * @throws SAXException  if a delegate parser or the handler fails
+     * @throws TikaException if a delegate parser fails
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Because an EPub file is often made up of multiple XHTML files,
+        //  we need explicit control over the start and end of the document
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        ContentHandler childHandler = new EmbeddedContentHandler(
+              new BodyContentHandler(xhtml));
+
+        // The ZipInputStream is not closed here: closing it would also close
+        // the underlying stream that was passed in by the caller.
+        ZipInputStream zip = new ZipInputStream(stream);
+        ZipEntry entry = zip.getNextEntry();
+        while (entry != null) {
+            String name = entry.getName();
+            if (name.equals("mimetype")) {
+                // The mimetype entry may carry a trailing newline or padding;
+                // strip it so the stored content type is a clean media type,
+                // and do not overwrite the content type with a blank value.
+                String type = IOUtils.toString(zip, UTF_8).trim();
+                if (!type.isEmpty()) {
+                    metadata.set(Metadata.CONTENT_TYPE, type);
+                }
+            } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
+                meta.parse(zip, new DefaultHandler(), metadata, context);
+            } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
+                content.parse(zip, childHandler, metadata, context);
+            }
+            entry = zip.getNextEntry();
+        }
+
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.epub.EpubParser

Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EpubParserTest {
+
+    /**
+     * Runs {@link EpubParser} over the bundled testEPUB.epub and checks the
+     * detected content type, a few metadata fields and the extracted text.
+     */
+    @Test
+    public void testXMLParser() throws Exception {
+        final Metadata epubMetadata = new Metadata();
+        final ContentHandler bodyHandler = new BodyContentHandler();
+        final ParseContext parseContext = new ParseContext();
+
+        try (InputStream epubStream = EpubParserTest.class.getResourceAsStream(
+                "/test-documents/testEPUB.epub")) {
+            EpubParser epubParser = new EpubParser();
+            epubParser.parse(epubStream, bodyHandler, epubMetadata, parseContext);
+
+            // Metadata picked up from the container
+            assertEquals("application/epub+zip",
+                    epubMetadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("en",
+                    epubMetadata.get(TikaCoreProperties.LANGUAGE));
+            assertEquals("This is an ePub test publication for Tika.",
+                    epubMetadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Apache",
+                    epubMetadata.get(TikaCoreProperties.PUBLISHER));
+
+            // Text extracted from the XHTML entries
+            String extractedText = bodyHandler.toString();
+            assertContains("Plus a simple div", extractedText);
+            assertContains("First item", extractedText);
+            assertContains("The previous headings were subchapters", extractedText);
+            assertContains("Table data", extractedText);
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-parser-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ibooks;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.epub.EpubParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class iBooksParserTest {
+
+    @Test
+    public void testiBooksParser() throws Exception {
+        try (InputStream input = iBooksParserTest.class.getResourceAsStream(
+                "/test-documents/testiBooks.ibooks")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new EpubParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals("application/x-ibooks+zip",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("en-GB",
+                    metadata.get(TikaCoreProperties.LANGUAGE));
+            assertEquals("iBooks Author v1.0",
+                    metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("Apache",
+                    metadata.get(TikaCoreProperties.CREATOR));
+
+            /* TODO For some reason, the xhtml files in iBooks-style ePub are not parsed properly, and the content comes back empty. Re-enable the assertions below once that is fixed (assertContains is not imported in this file and will need a static import).
+            String content = handler.toString();
+            System.out.println("content="+content);
+            assertContains("Plus a simple div", content);
+            assertContains("First item", content);
+            assertContains("The previous headings were subchapters", content);
+            assertContains("Table data", content);
+            assertContains("Lorem ipsum dolor rutur amet", content);
+            */
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-journal-parser-module</artifactId>
+  <name>Apache Tika Journal Parser Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <cxf.version>3.0.3</cxf.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.cxf</groupId>
+      <artifactId>cxf-rt-rs-client</artifactId>
+      <version>${cxf.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.json</groupId>
+      <artifactId>json</artifactId>
+      <version>20140107</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-pdf-parser-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-parser-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Properties;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Client for a GROBID REST service: posts a PDF to the header-processing
+ * endpoint and copies the metadata extracted from the TEI response into a
+ * Tika {@link Metadata} object under the {@code grobid:header_} prefix.
+ * <p>
+ * The service URL is read from the {@code grobid.server.url} property of the
+ * {@code GrobidExtractor.properties} classpath resource; when the resource or
+ * the property is missing or empty, {@value #GROBID_REST_HOST} is used.
+ */
+public class GrobidRESTParser {
+
+  private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+  // Path probed by canRun(). The original author noted that the "isalive"
+  // endpoint does not work (reason unknown), so this path is used instead.
+  private static final String GROBID_ISALIVE_PATH = "/grobid";
+
+  private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+
+  private final String restHostUrlStr;
+
+  /**
+   * Creates a client bound to the configured GROBID URL, falling back to the
+   * default host when no URL can be read.
+   */
+  public GrobidRESTParser() {
+    String configuredUrl = null;
+    try {
+      configuredUrl = readRestUrl();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    this.restHostUrlStr = urlOrDefault(configuredUrl);
+  }
+
+  /**
+   * Sends the file to GROBID and adds every returned TEI metadata entry to
+   * {@code metadata} as {@code grobid:header_<key>}. Failures while reading or
+   * interpreting the response are printed and otherwise ignored.
+   *
+   * @param filePath path of the PDF file to upload
+   * @param handler  not used by this method
+   * @param metadata metadata object that receives the GROBID entries
+   * @param context  not used by this method
+   * @throws FileNotFoundException if {@code filePath} cannot be opened
+   */
+  public void parse(String filePath, ContentHandler handler, Metadata metadata,
+      ParseContext context) throws FileNotFoundException {
+
+    File pdfFile = new File(filePath);
+    FileInputStream pdfStream = new FileInputStream(pdfFile);
+    try {
+      ContentDisposition cd = new ContentDisposition(
+          "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+      Attachment att = new Attachment("input", pdfStream, cd);
+      MultipartBody body = new MultipartBody(att);
+
+      Response response = WebClient
+          .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+          .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+          .post(body);
+
+      try {
+        String resp = response.readEntity(String.class);
+        Metadata teiMet = new TEIParser().parse(resp);
+        for (String key : teiMet.names()) {
+          metadata.add("grobid:header_" + key, teiMet.get(key));
+        }
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+    } finally {
+      // Release the file handle whether or not the request succeeded.
+      try {
+        pdfStream.close();
+      } catch (IOException ignored) {
+        // Nothing useful can be done if closing a read-only stream fails.
+      }
+    }
+  }
+
+  /**
+   * Reads the configured GROBID URL from the properties resource.
+   *
+   * @return the {@code grobid.server.url} value, or {@code null} when the
+   *         resource or the property is absent
+   * @throws IOException if the resource exists but cannot be read
+   */
+  private static String readRestUrl() throws IOException {
+    Properties grobidProperties = new Properties();
+    try (java.io.InputStream in = GrobidRESTParser.class
+        .getResourceAsStream("GrobidExtractor.properties")) {
+      if (in == null) {
+        // Resource not on the classpath: let the caller apply the default.
+        return null;
+      }
+      grobidProperties.load(in);
+    }
+
+    return grobidProperties.getProperty("grobid.server.url");
+  }
+
+  /** Returns {@code configured} unless it is null or empty, else the default host. */
+  private static String urlOrDefault(String configured) {
+    if (configured == null || configured.equals("")) {
+      return GROBID_REST_HOST;
+    }
+    return configured;
+  }
+
+  /**
+   * Probes the GROBID service.
+   *
+   * @return {@code true} when the probe page is returned and starts with
+   *         {@code <h4>}; {@code false} on any failure
+   */
+  protected static boolean canRun() {
+    try {
+      Response response = WebClient
+          .create(urlOrDefault(readRestUrl()) + GROBID_ISALIVE_PATH)
+          .accept(MediaType.TEXT_HTML).get();
+      String resp = response.readEntity(String.class);
+      return resp != null && resp.startsWith("<h4>");
+    } catch (Exception e) {
+      e.printStackTrace();
+      return false;
+    }
+  }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * PDF parser for journal articles: first asks a GROBID REST service for
+ * header metadata, then runs the regular {@link PDFParser} over the same
+ * document.
+ */
+public class JournalParser extends AbstractParser {
+
+  /**
+   * Generated serial ID
+   */
+  private static final long serialVersionUID = 4664255544154296438L;
+
+  private static final MediaType TYPE = MediaType.application("pdf");
+
+  private static final Set<MediaType> SUPPORTED_TYPES = Collections
+      .singleton(TYPE);
+
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return SUPPORTED_TYPES;
+  }
+
+  /**
+   * Spools the stream to a file, sends that file to GROBID and then parses it
+   * as a PDF.
+   *
+   * @param stream   the PDF document
+   * @param handler  receiver of the SAX events produced by the PDF parser
+   * @param metadata metadata object filled by GROBID and the PDF parser
+   * @param context  parse context handed on to both parsers
+   * @throws IOException   if the document cannot be read or spooled
+   * @throws SAXException  if the handler fails
+   * @throws TikaException if the PDF parser fails or temporary resources
+   *                       cannot be released
+   */
+  public void parse(InputStream stream, ContentHandler handler,
+      Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
+    TemporaryResources tmp = new TemporaryResources();
+    try {
+      TikaInputStream tis = TikaInputStream.get(stream, tmp);
+      File tmpFile = tis.getFile();
+
+      GrobidRESTParser grobidParser = new GrobidRESTParser();
+      grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);
+
+      PDFParser parser = new PDFParser();
+      // Close the re-opened file once the PDF parser is done with it.
+      try (InputStream pdfStream = new FileInputStream(tmpFile)) {
+        parser.parse(pdfStream, handler, metadata, context);
+      }
+    } finally {
+      // Release whatever was registered with tmp while spooling the stream.
+      tmp.dispose();
+    }
+  }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,893 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.json.XML;
+
+public class TEIParser {
+
+  /** Creates a parser; no state is initialised. */
+  public TEIParser() {
+    // intentionally empty
+  }
+
+  /**
+   * Converts a TEI XML document to JSON and collects the fields of interest
+   * into a fresh {@link Metadata} object.
+   *
+   * @param source the TEI XML text
+   * @return the metadata extracted from {@code source}
+   */
+  public Metadata parse(String source) {
+    Metadata grobidMetadata = new Metadata();
+    createGrobidMetadata(source, XML.toJSONObject(source), grobidMetadata);
+    return grobidMetadata;
+  }
+
+  /**
+   * Fills {@code metadata} from the {@code TEI/teiHeader} object of the JSON
+   * form of the document, then adds the static entries.
+   *
+   * @param source   the original TEI XML text
+   * @param obj      JSON form of {@code source}; header parsing is skipped
+   *                 when it is null
+   * @param metadata target metadata object
+   */
+  private void createGrobidMetadata(String source, JSONObject obj,
+      Metadata metadata) {
+    if (obj != null) {
+      JSONObject tei = obj.getJSONObject("TEI");
+      JSONObject header = tei.getJSONObject("teiHeader");
+
+      // Each section is optional and handled by its own helper.
+      if (header.has("text")) {
+        JSONObject textSection = header.getJSONObject("text");
+        parseText(textSection, metadata);
+      }
+      if (header.has("fileDesc")) {
+        JSONObject fileDescSection = header.getJSONObject("fileDesc");
+        parseFileDesc(fileDescSection, metadata);
+      }
+      if (header.has("profileDesc")) {
+        JSONObject profileDescSection = header.getJSONObject("profileDesc");
+        parseProfileDesc(profileDescSection, metadata);
+      }
+    }
+
+    addStaticMet(source, obj, metadata);
+  }
+
+  /**
+   * Adds the entries that do not depend on the TEI structure: the metadata
+   * class name, the JSON form of the document and the original XML.
+   *
+   * @param source   the original TEI XML text
+   * @param obj      JSON form of {@code source}; may be null, in which case
+   *                 no {@code TEIJSONSource} entry is added
+   * @param metadata target metadata object
+   */
+  private void addStaticMet(String source, JSONObject obj, Metadata metadata) {
+    metadata.add("Class", Metadata.class.getName());
+    // The caller treats obj as nullable, so do not dereference it blindly.
+    if (obj != null) {
+      metadata.add("TEIJSONSource", obj.toString());
+    }
+    metadata.add("TEIXMLSource", source);
+  }
+
+  /**
+   * Copies the {@code xml:lang} attribute of the text section, when present,
+   * into the {@code Language} metadata entry.
+   */
+  private void parseText(JSONObject text, Metadata metadata) {
+    if (!text.has("xml:lang")) {
+      return;
+    }
+    String language = text.getString("xml:lang");
+    metadata.add("Language", language);
+  }
+
+  /**
+   * Dispatches the optional {@code titleStmt} and {@code sourceDesc} children
+   * of the file description to their helpers, in that order.
+   */
+  private void parseFileDesc(JSONObject fileDesc, Metadata metadata) {
+    if (fileDesc.has("titleStmt")) {
+      JSONObject titleStmt = fileDesc.getJSONObject("titleStmt");
+      parseTitleStmt(titleStmt, metadata);
+    }
+    if (fileDesc.has("sourceDesc")) {
+      JSONObject sourceDesc = fileDesc.getJSONObject("sourceDesc");
+      parseSourceDesc(sourceDesc, metadata);
+    }
+  }
+
+  /**
+   * Adds the {@code content} of the title object, when both are present, as
+   * the {@code Title} metadata entry.
+   */
+  private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) {
+    if (!titleStmt.has("title")) {
+      return;
+    }
+    JSONObject titleObj = titleStmt.getJSONObject("title");
+    if (!titleObj.has("content")) {
+      return;
+    }
+    metadata.add("Title", titleObj.getString("content"));
+  }
+
+  /**
+   * Hands the optional {@code biblStruct} child of the source description to
+   * {@code parseBiblStruct}.
+   */
+  private void parseSourceDesc(JSONObject sourceDesc, Metadata metadata) {
+    if (!sourceDesc.has("biblStruct")) {
+      return;
+    }
+    JSONObject biblStruct = sourceDesc.getJSONObject("biblStruct");
+    parseBiblStruct(biblStruct, metadata);
+  }
+
+  /**
+   * Extracts the authors from the {@code analytic} section and adds the
+   * derived {@code Address}, {@code Affiliation}, {@code Authors} and
+   * {@code FullAffiliations} entries. An {@code Error} entry is added when
+   * there is no {@code analytic} JSON object.
+   * <p>
+   * The author value is a JSON object for a single author and a JSON array
+   * for several; both shapes now produce the derived entries (previously a
+   * single author was parsed but its metadata was never added).
+   */
+  private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) {
+    if (biblStruct.has("analytic")
+        && biblStruct.get("analytic") instanceof JSONObject) {
+      JSONObject analytic = biblStruct.getJSONObject("analytic");
+      if (analytic.has("author")) {
+        Object authorObj = analytic.get("author");
+
+        List<Author> authorList = new ArrayList<Author>();
+        if (authorObj instanceof JSONObject) {
+          parseAuthor((JSONObject) authorObj, authorList);
+        } else if (authorObj instanceof JSONArray) {
+          JSONArray authors = (JSONArray) authorObj;
+          for (int i = 0; i < authors.length(); i++) {
+            parseAuthor(authors.getJSONObject(i), authorList);
+          }
+        } else {
+          // Neither an object nor an array: nothing to derive.
+          return;
+        }
+
+        metadata.add("Address", getMetadataAddresses(authorList));
+        metadata.add("Affiliation", getMetadataAffiliations(authorList));
+        metadata.add("Authors", getMetadataAuthors(authorList));
+        metadata.add("FullAffiliations",
+            getMetadataFullAffiliations(authorList));
+      }
+    } else {
+      metadata.add("Error", "Unable to parse: no analytic section in JSON");
+    }
+
+  }
+
+  /**
+   * Renders the distinct affiliations of all authors, in first-seen order, as
+   * {@code [a,b,...]} using each affiliation's {@code toString()}.
+   * <p>
+   * The previous implementation appended the builder to itself while removing
+   * the trailing comma, which duplicated the whole content, and returned
+   * {@code "]"} for an empty list; this version returns {@code "[]"} then.
+   *
+   * @param authorList authors whose affiliations are collected
+   * @return the bracketed, comma-separated list
+   */
+  private String getMetadataFullAffiliations(List<Author> authorList) {
+    List<Affiliation> unique = new ArrayList<Affiliation>();
+
+    for (Author a : authorList) {
+      for (Affiliation af : a.getAffiliations()) {
+        if (!unique.contains(af)) {
+          unique.add(af);
+        }
+      }
+    }
+
+    StringBuilder metAffils = new StringBuilder();
+    metAffils.append("[");
+    for (int i = 0; i < unique.size(); i++) {
+      // Separator goes between items, so nothing has to be trimmed afterwards.
+      if (i > 0) {
+        metAffils.append(",");
+      }
+      metAffils.append(unique.get(i).toString());
+    }
+    metAffils.append("]");
+    return metAffils.toString();
+  }
+
+  /**
+   * Renders the authors as one string: for each author the first, middle and
+   * surname (each passed through {@code printOrBlank}), followed by the
+   * comma-separated 1-based indexes of that author's affiliations within the
+   * list of distinct affiliations, followed by a single space.
+   *
+   * @param authorList authors to render
+   * @return the concatenated author string
+   */
+  private String getMetadataAuthors(List<Author> authorList) {
+    // Distinct affiliations in first-seen order; the index + 1 is the label.
+    List<Affiliation> distinctAffiliations = new ArrayList<Affiliation>();
+    for (Author author : authorList) {
+      for (Affiliation affiliation : author.getAffiliations()) {
+        if (!distinctAffiliations.contains(affiliation)) {
+          distinctAffiliations.add(affiliation);
+        }
+      }
+    }
+
+    StringBuilder rendered = new StringBuilder();
+    for (Author author : authorList) {
+      rendered.append(printOrBlank(author.getFirstName()));
+      rendered.append(printOrBlank(author.getMiddleName()));
+      rendered.append(printOrBlank(author.getSurName()));
+
+      boolean needsComma = false;
+      for (int pos = 0; pos < distinctAffiliations.size(); pos++) {
+        Affiliation candidate = distinctAffiliations.get(pos);
+        if (author.getAffiliations().contains(candidate)) {
+          if (needsComma) {
+            rendered.append(",");
+          }
+          rendered.append(pos + 1);
+          needsComma = true;
+        }
+      }
+
+      rendered.append(" ");
+    }
+
+    return rendered.toString();
+  }
+
+  /**
+   * Renders the distinct affiliations as numbered organisation names
+   * separated by {@code "; "}, e.g. {@code "1 <org>; 2 <org>"}.
+   *
+   * @param authorList authors whose affiliations are collected
+   * @return the numbered list, or an empty string when there is none
+   */
+  private String getMetadataAffiliations(List<Author> authorList) {
+    List<Affiliation> distinctAffiliations = new ArrayList<Affiliation>();
+    for (Author author : authorList) {
+      for (Affiliation affiliation : author.getAffiliations()) {
+        if (!distinctAffiliations.contains(affiliation)) {
+          distinctAffiliations.add(affiliation);
+        }
+      }
+    }
+
+    StringBuilder rendered = new StringBuilder();
+    for (int pos = 0; pos < distinctAffiliations.size(); pos++) {
+      rendered.append(pos + 1);
+      rendered.append(" ");
+      rendered.append(distinctAffiliations.get(pos).getOrgName().toString());
+      // Drops the last character just written - presumably a trailing
+      // separator produced by the org name's toString(); TODO confirm.
+      rendered.setLength(rendered.length() - 1);
+      rendered.append("; ");
+    }
+
+    // Remove the final "; " when at least one affiliation was written.
+    if (!distinctAffiliations.isEmpty()) {
+      rendered.setLength(rendered.length() - 2);
+    }
+
+    return rendered.toString();
+  }
+
+  /**
+   * Renders the distinct addresses of all affiliations, in first-seen order,
+   * each followed by a single space.
+   *
+   * @param authorList authors whose affiliation addresses are collected
+   * @return the concatenated addresses
+   */
+  private String getMetadataAddresses(List<Author> authorList) {
+    List<Address> distinctAddresses = new ArrayList<Address>();
+    for (Author author : authorList) {
+      for (Affiliation affiliation : author.getAffiliations()) {
+        Address address = affiliation.getAddress();
+        if (!distinctAddresses.contains(address)) {
+          distinctAddresses.add(address);
+        }
+      }
+    }
+
+    StringBuilder rendered = new StringBuilder();
+    for (Address address : distinctAddresses) {
+      rendered.append(address.toString()).append(" ");
+    }
+    return rendered.toString();
+  }
+
+  /**
+   * Converts one author JSON object into an {@link Author} and appends it to
+   * the given list. An author is appended even when no "persName" is present;
+   * in that case all of its fields keep their defaults.
+   *
+   * @param authorObj
+   *          JSON object describing the author
+   * @param authorList
+   *          list receiving the parsed author
+   */
+  private void parseAuthor(JSONObject authorObj, List<Author> authorList) {
+    Author parsed = new Author();
+
+    if (authorObj.has("persName")) {
+      JSONObject personName = authorObj.getJSONObject("persName");
+
+      // "forename" is either a single name part or an array of name parts.
+      Object forename = personName.has("forename") ? personName.get("forename") : null;
+      if (forename instanceof JSONObject) {
+        parseNamePart((JSONObject) forename, parsed);
+      } else if (forename instanceof JSONArray) {
+        JSONArray parts = (JSONArray) forename;
+        for (int idx = 0; idx < parts.length(); idx++) {
+          parseNamePart(parts.getJSONObject(idx), parsed);
+        }
+      }
+
+      if (personName.has("surname")) {
+        parsed.setSurName(personName.getString("surname"));
+      }
+
+      // Affiliations are only read for authors that carry a "persName".
+      if (authorObj.has("affiliation")) {
+        parseAffiliation(authorObj.get("affiliation"), parsed);
+      }
+    }
+
+    authorList.add(parsed);
+  }
+
+  /**
+   * Copies one forename part onto the author: a part of type "first" sets the
+   * first name, a part of type "middle" sets the middle name. Parts lacking
+   * either "type" or "content", and parts of any other type, are ignored.
+   *
+   * @param namePart
+   *          JSON object holding "type" and "content"
+   * @param author
+   *          author to update
+   */
+  private void parseNamePart(JSONObject namePart, Author author) {
+    if (!namePart.has("type") || !namePart.has("content")) {
+      return;
+    }
+
+    String kind = namePart.getString("type");
+    String text = namePart.getString("content");
+
+    if ("first".equals(kind)) {
+      author.setFirstName(text);
+    } else if ("middle".equals(kind)) {
+      author.setMiddleName(text);
+    }
+  }
+
+  /**
+   * Parses the "affiliation" value of an author, which may be a single JSON
+   * object or a JSON array of objects. Any other value is ignored.
+   *
+   * @param affiliationJSON
+   *          raw "affiliation" value
+   * @param author
+   *          author receiving the parsed affiliations
+   */
+  private void parseAffiliation(Object affiliationJSON, Author author) {
+    if (affiliationJSON instanceof JSONArray) {
+      JSONArray entries = (JSONArray) affiliationJSON;
+      for (int idx = 0; idx < entries.length(); idx++) {
+        parseOneAffiliation(entries.getJSONObject(idx), author);
+      }
+      return;
+    }
+
+    if (affiliationJSON instanceof JSONObject) {
+      parseOneAffiliation((JSONObject) affiliationJSON, author);
+    }
+  }
+
+  /**
+   * Parses a single affiliation JSON object (optional "address" and "orgName")
+   * and appends the resulting {@link Affiliation} to the author.
+   * <p>
+   * "orgName" may be a single JSON object or an array of them. The parsed
+   * organisation name is stored on the affiliation in both cases; previously
+   * it was stored only for the array form, so a single-object "orgName" was
+   * silently dropped.
+   *
+   * @param affiliationObj
+   *          JSON object describing the affiliation
+   * @param author
+   *          author receiving the affiliation
+   */
+  private void parseOneAffiliation(JSONObject affiliationObj, Author author) {
+
+    Affiliation affiliation = new Affiliation();
+    if (affiliationObj.has("address")) {
+      parseAddress(affiliationObj.getJSONObject("address"), affiliation);
+    }
+
+    if (affiliationObj.has("orgName")) {
+      OrgName orgName = new OrgName();
+      Object orgObject = affiliationObj.get("orgName");
+      if (orgObject instanceof JSONObject) {
+        parseOrgName((JSONObject) orgObject, orgName);
+      } else if (orgObject instanceof JSONArray) {
+        JSONArray orgNames = (JSONArray) orgObject;
+        for (int i = 0; i < orgNames.length(); i++) {
+          parseOrgName(orgNames.getJSONObject(i), orgName);
+        }
+      }
+
+      // Must happen for both the object and the array form.
+      affiliation.setOrgName(orgName);
+    }
+
+    author.getAffiliations().add(affiliation);
+  }
+
+  /**
+   * Builds an {@link Address} from the given JSON object and stores it on the
+   * affiliation. The keys "region", "postCode", "settlement" and "country"
+   * are all optional; "country" may be a plain string or an object with
+   * "content" and "key".
+   *
+   * @param addressObj
+   *          JSON object describing the address
+   * @param affiliation
+   *          affiliation receiving the address
+   */
+  private void parseAddress(JSONObject addressObj, Affiliation affiliation) {
+    Address parsed = new Address();
+
+    if (addressObj.has("region")) {
+      parsed.setRegion(addressObj.getString("region"));
+    }
+
+    if (addressObj.has("postCode")) {
+      // The post code is not necessarily a JSON string, hence valueToString.
+      Object rawPostCode = addressObj.get("postCode");
+      parsed.setPostCode(JSONObject.valueToString(rawPostCode));
+    }
+
+    if (addressObj.has("settlement")) {
+      parsed.setSettlment(addressObj.getString("settlement"));
+    }
+
+    if (addressObj.has("country")) {
+      Country parsedCountry = new Country();
+      Object rawCountry = addressObj.get("country");
+
+      if (rawCountry instanceof String) {
+        parsedCountry.setContent((String) rawCountry);
+      } else if (rawCountry instanceof JSONObject) {
+        JSONObject countryJson = (JSONObject) rawCountry;
+
+        if (countryJson.has("content")) {
+          parsedCountry.setContent(countryJson.getString("content"));
+        }
+
+        if (countryJson.has("key")) {
+          parsedCountry.setKey(countryJson.getString("key"));
+        }
+      }
+      parsed.setCountry(parsedCountry);
+    }
+
+    affiliation.setAddress(parsed);
+  }
+
+  /**
+   * Reads one organisation name part ("content" and "type", both optional)
+   * and appends it to the given {@link OrgName}. A part is appended even when
+   * neither key is present.
+   *
+   * @param orgObj
+   *          JSON object describing the name part
+   * @param orgName
+   *          organisation name receiving the part
+   */
+  private void parseOrgName(JSONObject orgObj, OrgName orgName) {
+    OrgTypeName part = new OrgTypeName();
+
+    if (orgObj.has("content")) {
+      String text = orgObj.getString("content");
+      part.setName(text);
+    }
+
+    if (orgObj.has("type")) {
+      String kind = orgObj.getString("type");
+      part.setType(kind);
+    }
+
+    List<OrgTypeName> parts = orgName.getTypeNames();
+    parts.add(part);
+  }
+
+  /**
+   * Extracts the abstract and the keywords from a "profileDesc" JSON object
+   * and adds them to the metadata as "Abstract" and "Keyword" entries.
+   * <p>
+   * The abstract text is read from the "p" member of the "abstract" object.
+   * The earlier behaviour of reading "p" directly from profileDesc is kept as
+   * a fallback. NOTE(review): the nesting abstract -> p is assumed from the
+   * TEI layout; confirm against real extractor output.
+   * <p>
+   * "keywords" may be a plain string or an object whose "term" member is
+   * either an array or a single value. The single-value case previously
+   * failed inside getJSONArray.
+   *
+   * @param profileDesc
+   *          JSON object of the TEI profileDesc element
+   * @param metadata
+   *          metadata receiving the extracted values
+   */
+  private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) {
+    if (profileDesc.has("abstract")) {
+      Object abstractObj = profileDesc.get("abstract");
+      Object nestedText = null;
+      if (abstractObj instanceof JSONObject
+          && ((JSONObject) abstractObj).has("p")) {
+        nestedText = ((JSONObject) abstractObj).get("p");
+      }
+
+      if (nestedText instanceof String) {
+        metadata.add("Abstract", (String) nestedText);
+      } else if (profileDesc.has("p")) {
+        // Legacy location, kept so inputs that worked before still work.
+        metadata.add("Abstract", profileDesc.getString("p"));
+      }
+    }
+
+    if (profileDesc.has("textClass")) {
+      JSONObject textClass = profileDesc.getJSONObject("textClass");
+
+      if (textClass.has("keywords")) {
+        Object keywordsObj = textClass.get("keywords");
+        // test AJ15.pdf
+        if (keywordsObj instanceof String) {
+          metadata.add("Keyword", (String) keywordsObj);
+        } else if (keywordsObj instanceof JSONObject) {
+          JSONObject keywords = (JSONObject) keywordsObj;
+          if (keywords.has("term")) {
+            Object terms = keywords.get("term");
+            if (terms instanceof JSONArray) {
+              JSONArray termArr = (JSONArray) terms;
+              for (int i = 0; i < termArr.length(); i++) {
+                metadata.add("Keyword",
+                    JSONObject.valueToString(termArr.get(i)));
+              }
+            } else {
+              // A lone term is not wrapped in an array.
+              metadata.add("Keyword", JSONObject.valueToString(terms));
+            }
+          }
+        }
+
+      }
+    }
+
+  }
+
+  /**
+   * Renders a value followed by one blank, or just a single blank when the
+   * value is null or empty.
+   *
+   * @param val
+   *          value to render, may be null
+   * @return val + " ", or " " for a null/empty value
+   */
+  private String printOrBlank(String val) {
+    if (val == null || val.length() == 0) {
+      return " ";
+    }
+    return val + " ";
+  }
+
+  /**
+   * Mutable holder for one author: name parts plus the list of affiliations.
+   * The name parts start out as null; the affiliation list starts out empty.
+   */
+  class Author {
+
+    private String surName;
+
+    private String middleName;
+
+    private String firstName;
+
+    private List<Affiliation> affiliations;
+
+    public Author() {
+      this.surName = null;
+      this.middleName = null;
+      this.firstName = null;
+      this.affiliations = new ArrayList<Affiliation>();
+    }
+
+    /**
+     * @return the surName
+     */
+    public String getSurName() {
+      return surName;
+    }
+
+    /**
+     * @param surName
+     *          the surName to set
+     */
+    public void setSurName(String surName) {
+      this.surName = surName;
+    }
+
+    /**
+     * @return the middleName
+     */
+    public String getMiddleName() {
+      return middleName;
+    }
+
+    /**
+     * @param middleName
+     *          the middleName to set
+     */
+    public void setMiddleName(String middleName) {
+      this.middleName = middleName;
+    }
+
+    /**
+     * @return the firstName
+     */
+    public String getFirstName() {
+      return firstName;
+    }
+
+    /**
+     * @param firstName
+     *          the firstName to set
+     */
+    public void setFirstName(String firstName) {
+      this.firstName = firstName;
+    }
+
+    /**
+     * @return the affiliations
+     */
+    public List<Affiliation> getAffiliations() {
+      return affiliations;
+    }
+
+    /**
+     * @param affiliations
+     *          the affiliations to set
+     */
+    public void setAffiliations(List<Affiliation> affiliations) {
+      this.affiliations = affiliations;
+    }
+
+    /**
+     * Renders all fields; a null middle name is shown as an empty string.
+     *
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+      // The conditional must be parenthesised: '+' binds tighter than '!=',
+      // so without the parentheses the whole left-hand concatenation becomes
+      // the condition and only middleName is returned.
+      return "Author [surName=" + surName + ", middleName="
+          + (middleName != null ? middleName : "") + ", firstName=" + firstName
+          + ", affiliations=" + affiliations + "]";
+    }
+
+  }
+
+  class Affiliation {
+
+    private OrgName orgName;
+
+    private Address address;
+
+    public Affiliation() {
+      this.orgName = new OrgName();
+      this.address = new Address();
+    }
+
+    /**
+     * @return the orgName
+     */
+    public OrgName getOrgName() {
+      return orgName;
+    }
+
+    /**
+     * @param orgName
+     *          the orgName to set
+     */
+    public void setOrgName(OrgName orgName) {
+      this.orgName = orgName;
+    }
+
+    /**
+     * @return the address
+     */
+    public Address getAddress() {
+      return address;
+    }
+
+    /**
+     * @param address
+     *          the address to set
+     */
+    public void setAddress(Address address) {
+      this.address = address;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      Affiliation otherA = (Affiliation) obj;
+      return this.getAddress().equals(otherA.getAddress())
+          && this.getOrgName().equals(otherA.getOrgName());
+
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+      return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+    }
+
+  }
+
+  class OrgName {
+    private List<OrgTypeName> typeNames;
+
+    public OrgName() {
+      this.typeNames = new ArrayList<OrgTypeName>();
+    }
+
+    /**
+     * @return the typeNames
+     */
+    public List<OrgTypeName> getTypeNames() {
+      return typeNames;
+    }
+
+    /**
+     * @param typeNames
+     *          the typeNames to set
+     */
+    public void setTypeNames(List<OrgTypeName> typeNames) {
+      this.typeNames = typeNames;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#toString()
+     */
+
+    @Override
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      for (OrgTypeName on : this.typeNames) {
+        builder.append(on.getName());
+        builder.append(" ");
+      }
+      return builder.toString();
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      OrgName otherA = (OrgName) obj;
+
+      if (otherA.getTypeNames() != null) {
+        if (this.typeNames == null) {
+          return false;
+        } else {
+          return this.typeNames.size() == otherA.getTypeNames().size();
+        }
+      } else {
+        if (this.typeNames == null) {
+          return true;
+        } else
+          return false;
+      }
+
+    }
+
+  }
+
+  class OrgTypeName {
+    private String name;
+    private String type;
+
+    public OrgTypeName() {
+      this.name = null;
+      this.type = null;
+    }
+
+    /**
+     * @return the name
+     */
+    public String getName() {
+      return name;
+    }
+
+    /**
+     * @param name
+     *          the name to set
+     */
+    public void setName(String name) {
+      this.name = name;
+    }
+
+    /**
+     * @return the type
+     */
+    public String getType() {
+      return type;
+    }
+
+    /**
+     * @param type
+     *          the type to set
+     */
+    public void setType(String type) {
+      this.type = type;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      OrgTypeName otherOrgName = (OrgTypeName) obj;
+      return this.type.equals(otherOrgName.getType())
+          && this.name.equals(otherOrgName.getName());
+    }
+
+  }
+
+  private class Address {
+
+    private String region;
+    private String postCode;
+    private String settlment;
+    private Country country;
+
+    public Address() {
+      this.region = null;
+      this.postCode = null;
+      this.settlment = null;
+      this.country = new Country();
+    }
+
+    /**
+     * @return the region
+     */
+    public String getRegion() {
+      return region;
+    }
+
+    /**
+     * @param region
+     *          the region to set
+     */
+    public void setRegion(String region) {
+      this.region = region;
+    }
+
+    /**
+     * @return the postCode
+     */
+    public String getPostCode() {
+      return postCode;
+    }
+
+    /**
+     * @param postCode
+     *          the postCode to set
+     */
+    public void setPostCode(String postCode) {
+      this.postCode = postCode;
+    }
+
+    /**
+     * @return the settlment
+     */
+    public String getSettlment() {
+      return settlment;
+    }
+
+    /**
+     * @param settlment
+     *          the settlment to set
+     */
+    public void setSettlment(String settlment) {
+      this.settlment = settlment;
+    }
+
+    /**
+     * @return the country
+     */
+    public Country getCountry() {
+      return country;
+    }
+
+    /**
+     * @param country
+     *          the country to set
+     */
+    public void setCountry(Country country) {
+      this.country = country;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      Address otherA = (Address) obj;
+      if (this.settlment == null) {
+        return otherA.getSettlment() == null;
+      } else if (this.country == null) {
+        return otherA.getCountry() == null;
+      } else if (this.postCode == null) {
+        return otherA.getPostCode() == null;
+      } else if (this.region == null) {
+        return otherA.getRegion() == null;
+      }
+
+      return this.settlment.equals(otherA.getSettlment())
+          && this.country.equals(otherA.getCountry())
+          && this.postCode.equals(otherA.getPostCode())
+          && this.region.equals(otherA.getRegion());
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      builder.append(settlment);
+      builder.append(", ");
+      builder.append(region);
+      builder.append(" ");
+      builder.append(postCode);
+      builder.append(" ");
+      builder.append(country.getContent());
+      return builder.toString();
+    }
+  }
+
+  private class Country {
+    private String key;
+    private String content;
+
+    public Country() {
+      this.key = null;
+      this.content = null;
+    }
+
+    /**
+     * @return the key
+     */
+    public String getKey() {
+      return key;
+    }
+
+    /**
+     * @param key
+     *          the key to set
+     */
+    public void setKey(String key) {
+      this.key = key;
+    }
+
+    /**
+     * @return the content
+     */
+    public String getContent() {
+      return content;
+    }
+
+    /**
+     * @param content
+     *          the content to set
+     */
+    public void setContent(String content) {
+      this.content = content;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see java.lang.Object#equals(java.lang.Object)
+     */
+    @Override
+    public boolean equals(Object obj) {
+      Country otherC = (Country) obj;
+
+      if (this.key == null) {
+        if (otherC.getKey() != null) {
+          return false;
+        } else {
+          if (this.content == null) {
+            if (otherC.getContent() != null) {
+              return false;
+            } else {
+              return true;
+            }
+          } else {
+            return content.equals(otherC.getContent());
+          }
+        }
+      } else {
+        if (this.content == null) {
+          if (otherC.getContent() != null) {
+            return false;
+          } else {
+            return this.key.equals(otherC.getKey());
+          }
+        } else {
+          return this.key.equals(otherC.getKey())
+              && this.content.equals(otherC.getContent());
+        }
+      }
+    }
+
+  }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,19 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+#org.apache.tika.parser.journal.GrobidRESTParser
+org.apache.tika.parser.journal.JournalParser
+#org.apache.tika.parser.journal.TEIParser

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties Sat Jan 16 18:23:01 2016
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+grobid.server.url=http://localhost:8080

Added: tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-parser-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.apache.tika.parser.journal.GrobidRESTParser.canRun;
+import java.io.InputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs the journal parser against a sample PDF and checks that a header title
+ * was extracted. The test is skipped when {@code GrobidRESTParser.canRun()}
+ * reports false.
+ */
+public class JournalParserTest {
+
+  /**
+   * Parses the sample document and asserts that the "grobid:header_Title"
+   * metadata entry is present. Parser exceptions propagate so the test
+   * runner reports them with their full stack trace.
+   */
+  @Test
+  public void testJournalParser() throws Exception {
+    String path = "/test-documents/testJournalParser.pdf";
+    ContentHandler handler = new BodyContentHandler();
+    Metadata metadata = new Metadata();
+
+    assumeTrue(canRun());
+
+    InputStream stream = JournalParserTest.class.getResourceAsStream(path);
+    assertNotNull("Missing test resource " + path, stream);
+    JournalParser jParser = new JournalParser();
+    try {
+      jParser.parse(stream, handler, metadata, new ParseContext());
+    } finally {
+      // Always release the resource stream, even when parsing fails.
+      stream.close();
+    }
+
+    assertNotNull(metadata.get("grobid:header_Title"));
+  }
+}

Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml?rev=1725014&r1=1725011&r2=1725014&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -19,8 +19,8 @@
     <version>2.0-SNAPSHOT</version>
   </parent>
 
-  <artifactId>tika-multimedia-module</artifactId>
-  <name>Apache Tika Multimedia Module</name>
+  <artifactId>tika-multimedia-parser-module</artifactId>
+  <name>Apache Tika Multimedia Parser Module</name>
   <url>http://tika.apache.org/</url>
   
   <properties>
@@ -105,19 +105,19 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-web-module</artifactId>
+      <artifactId>tika-web-parser-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-pdf-module</artifactId>
+      <artifactId>tika-pdf-parser-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-office-module</artifactId>
+      <artifactId>tika-office-parser-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>