You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 16:37:37 UTC
[tika] 02/02: TIKA-2791 -- add tags/structure to tika-eval

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d838bd7ee0cbb7921a945f0469ebfd2627714a97
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 11:00:37 2018 -0500

    TIKA-2791 -- add tags/structure to tika-eval
---
 .../sax/AbstractRecursiveParserWrapperHandler.java |   4 +
 .../tika/sax/RecursiveParserWrapperHandler.java    |   1 +
 tika-eval/pom.xml                                  |   6 +-
 .../org/apache/tika/eval/AbstractProfiler.java     | 142 +++++++++++++++++----
 .../java/org/apache/tika/eval/ExtractComparer.java |  30 ++++-
 .../java/org/apache/tika/eval/ExtractProfiler.java |  27 +++-
 .../tika/eval/batch/ExtractComparerBuilder.java    |   2 +
 .../tika/eval/batch/ExtractProfilerBuilder.java    |   1 +
 .../main/java/org/apache/tika/eval/db/Cols.java    |  22 +++-
 .../org/apache/tika/eval/io/ExtractReader.java     |  71 +++++++----
 .../apache/tika/eval/util/ContentTagParser.java    |  89 +++++++++++++
 .../org/apache/tika/eval/util/ContentTags.java     |  63 +++++++++
 .../org/apache/tika/eval/SimpleComparerTest.java   | 126 +++++++++++++-----
 .../resources/test-dirs/extractsA/file15_tags.json |  41 ++++++
 .../test-dirs/extractsA/file16_badTags.json        |  41 ++++++
 .../test-dirs/extractsA/file17_tagsOutOfOrder.json |  41 ++++++
 .../resources/test-dirs/extractsB/file15_tags.html |  31 +++++
 .../test-dirs/extractsB/file16_badTags.html        |  31 +++++
 18 files changed, 675 insertions(+), 94 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index d53f18e..55f5c58 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -36,6 +36,10 @@ import java.nio.charset.Charset;
 public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler implements Serializable {
 
     public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
+    /**
+     * Simple class name of the content handler
+     */
+    public final static Property TIKA_CONTENT_HANDLER = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content_handler");
     public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
     public final static Property WRITE_LIMIT_REACHED =
             Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 5faf3a4..408598f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -114,6 +114,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
             String content = handler.toString();
             if (content != null && content.trim().length() > 0 ) {
                 metadata.add(TIKA_CONTENT, content);
+                metadata.add(TIKA_CONTENT_HANDLER, handler.getClass().getSimpleName());
             }
         }
     }
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 262910e..5cf5bed 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,7 +124,11 @@
             <artifactId>poi-scratchpad</artifactId>
             <version>${poi.version}</version>
         </dependency>
-
+        <dependency>
+            <groupId>org.ccil.cowan.tagsoup</groupId>
+            <artifactId>tagsoup</artifactId>
+            <version>1.2.1</version>
+        </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-batch</artifactId>
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 0b41acb..aa999dd 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -56,21 +56,25 @@ import org.apache.tika.eval.tokens.CommonTokenResult;
 import org.apache.tika.eval.tokens.TokenCounter;
 import org.apache.tika.eval.tokens.TokenIntPair;
 import org.apache.tika.eval.tokens.TokenStatistics;
+import org.apache.tika.eval.util.ContentTags;
+import org.apache.tika.eval.util.ContentTagParser;
 import org.apache.tika.eval.util.LanguageIDWrapper;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
 import org.apache.tika.utils.ExceptionUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
 
 public abstract class AbstractProfiler extends FileResourceConsumer {
 
     private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
 
-
     private static final String[] EXTRACT_EXTENSIONS = {
             ".json",
             ".txt",
@@ -103,14 +107,35 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
     public static final String TRUE = Boolean.toString(true);
     public static final String FALSE = Boolean.toString(false);
+    private static final String ZERO = "0";
 
 
     protected static final AtomicInteger ID = new AtomicInteger();
 
-    private final static String UNKNOWN_EXTENSION = "unk";
+    private static final String UNKNOWN_EXTENSION = "unk";
     //make this configurable
-    private final static String DIGEST_KEY = "X-TIKA:digest:MD5";
-
+    private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+
+    private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+
+    private static Map<String, Cols> initTags() {
+        //maps each upper-case tag name to the Cols entry that holds its count; TODO: simplify this mess
+        Map<String, Cols> tmp = new HashMap<>();
+        tmp.put("A", Cols.TAGS_A);
+        tmp.put("DIV", Cols.TAGS_DIV);
+        tmp.put("I", Cols.TAGS_I);
+        tmp.put("IMG", Cols.TAGS_IMG);
+        tmp.put("LI", Cols.TAGS_LI);
+        tmp.put("OL", Cols.TAGS_OL);
+        tmp.put("P", Cols.TAGS_P);
+        tmp.put("TABLE", Cols.TAGS_TABLE);
+        tmp.put("TD", Cols.TAGS_TD);
+        tmp.put("TITLE", Cols.TAGS_TITLE);
+        tmp.put("TR", Cols.TAGS_TR);
+        tmp.put("U", Cols.TAGS_U);
+        tmp.put("UL", Cols.TAGS_UL);
+        return Collections.unmodifiableMap(tmp); //read-only view: the shared static map cannot be modified by callers
+    }
     private static CommonTokenCountManager commonTokenCountManager;
     private String lastExtractExtension = null;
 
@@ -230,7 +255,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
     }
 
-    protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
+    protected void writeProfileData(EvalFilePaths fps, int i,
+                                    ContentTags contentTags, Metadata m,
                                     String fileId, String containerId,
                                     List<Integer> numAttachments, TableInfo profileTable) {
 
@@ -275,7 +301,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         data.put(Cols.ELAPSED_TIME_MILLIS,
                 getTime(m));
 
-        String content = getContent(m);
+        String content = contentTags.getContent();
         if (content == null || content.trim().length() == 0) {
             data.put(Cols.HAS_CONTENT, FALSE);
         } else {
@@ -331,17 +357,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
      * entered into the content table.
      *
      * @param fileId
-     * @param m
+     * @param contentTags
      * @param fieldName
      * @param contentsTable
      */
-    protected void writeContentData(String fileId, Metadata m,
+    protected void writeContentData(String fileId, ContentTags contentTags,
                                     String fieldName, TableInfo contentsTable) throws IOException {
-        if (m == null) {
+        if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
             return;
         }
         Map<Cols, String> data = new HashMap<>();
-        String content = getContent(m, maxContentLength, data);
+        String content = truncateContent(contentTags, maxContentLength, data);
         if (content == null || content.trim().length() == 0) {
             return;
         }
@@ -350,7 +376,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
         data.put(Cols.ID, fileId);
         data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
-        langid(m, data);
+        langid(contentTags, data);
         String langid = data.get(Cols.LANG_ID_1);
         langid = (langid == null) ? "" : langid;
 
@@ -383,7 +409,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
         data.put(Cols.TOKEN_LENGTH_STD_DEV,
                 Double.toString(summStats.getStandardDeviation()));
-        unicodeBlocks(m, data);
+        unicodeBlocks(contentTags, data);
         try {
             writer.writeRow(contentsTable, data);
         } catch (IOException e) {
@@ -391,6 +417,36 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         }
     }
 
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) { //writes one row of tag counts for fileId to tagsTable
+        Map<String, Integer> tags = contentTags.getTags();
+        if (tags.size() == 0 && contentTags.getParseException() == false) { //no tags and no parse failure: write no row
+            return;
+        }
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+
+        for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) { //every tag of interest gets a value in the row
+            Integer count = tags.get(e.getKey());
+            if (count == null) {
+                data.put(e.getValue(), ZERO); //tag has no entry in the counts map
+            } else {
+                data.put(e.getValue(), Integer.toString(count));
+            }
+        }
+
+        if (contentTags.getParseException()) { //a row is still written on parse failure so the failure is recorded
+            data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
+        } else {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
+        }
+        try {
+            writer.writeRow(tagsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e); //rethrown unchecked with the IOException as cause
+        }
+    }
+
+
     String getTime(Metadata m) {
         String elapsed = "-1";
 
@@ -459,14 +515,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     /**
      * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
      *
-     * @param metadata
+     * @param contentTags
      * @param maxLength
      * @param data
      * @return
      */
-    protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
+    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
         data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
-        String c = getContent(metadata);
+        if (contentTags == null) {
+            return "";
+        }
+        String c = contentTags.getContent();
         if (maxLength > -1 && c.length() > maxLength) {
             c = c.substring(0, maxLength);
             data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
@@ -474,19 +533,15 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         return c;
 
     }
-    protected static String getContent(Metadata metadata) {
+    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
         if (metadata == null) {
-            return "";
+            return ContentTags.EMPTY_CONTENT_TAGS;
         }
-        String c = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
-        if (c == null) {
-            return "";
-        }
-        return c;
+        return parseContentAndTags(evalFilePaths, metadata);
     }
 
-    void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
-        String content = getContent(metadata);
+    void unicodeBlocks(ContentTags contentTags, Map<Cols, String> data) {
+        String content = contentTags.getContent();
         if (content.length() < 200) {
             return;
         }
@@ -537,8 +592,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
     }
 
-    void langid(Metadata metadata, Map<Cols, String> data) {
-        String content = getContent(metadata);
+    void langid(ContentTags contentTags, Map<Cols, String> data) {
+        String content = contentTags.getContent();
         if (content.length() < 50) {
             return;
         }
@@ -765,5 +820,38 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
             sb.append(parts[i]);
         }
     }
+
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) { //wraps the stored content in ContentTags, parsing tags when it looks like markup
+        String s = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        if (s == null || s.length() == 0) { //no stored content
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+
+        String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER); //may be null when no handler name was recorded
+        if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
+            try { //extract file name ends in .html (any case)
+                return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException|SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+                return new ContentTags(s, true); //raw string as content; 'true' presumably marks the parse exception -- confirm in ContentTags
+            }
+        } else if ( //NOTE(review): a compressed .html extract (e.g. x.html.gz) appears to reach parseXML via the handler check below -- confirm
+                evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
+                (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+            try { //.xhtml file name, or handler recorded as ToXMLContentHandler
+                return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (TikaException|IOException|SAXException e) {
+                LOG.warn("Problem parsing xhtml in {}; backing off to treat string as text",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+                return new ContentTags(s, true); //same fallback as the html branch
+            }
+        }
+        return new ContentTags(s); //neither markup case matched: no tag parsing attempted
+    }
+
+
 }
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 1ff5f0b..86d1672 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -42,9 +42,9 @@ import org.apache.tika.eval.io.IDBWriter;
 import org.apache.tika.eval.tokens.ContrastStatistics;
 import org.apache.tika.eval.tokens.TokenContraster;
 import org.apache.tika.eval.tokens.TokenIntPair;
+import org.apache.tika.eval.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 
 public class ExtractComparer extends AbstractProfiler {
@@ -148,6 +148,13 @@ public class ExtractComparer extends AbstractProfiler {
     public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
             ExtractProfiler.CONTENTS_TABLE.getColInfos());
 
+    public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
+            ExtractProfiler.TAGS_TABLE.getColInfos());
+
+    public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
+            ExtractProfiler.TAGS_TABLE.getColInfos());
+
+
     public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
             ExtractProfiler.EXCEPTION_TABLE.getColInfos());
 
@@ -275,9 +282,14 @@ public class ExtractComparer extends AbstractProfiler {
                 //the first file should have the same id as the container id
                 String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
                 Metadata metadataA = metadataListA.get(i);
+                ContentTags contentTagsA = getContent(fpsA, metadataA);
+                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
                 Metadata metadataB = null;
+
                 //TODO: shouldn't be fileA!!!!
-                writeProfileData(fpsA, i, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
+                writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
+
+                writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
                 writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
                 int matchIndex = getMatch(i, metadataListA, metadataListB);
 
@@ -286,7 +298,9 @@ public class ExtractComparer extends AbstractProfiler {
                     handledB.add(matchIndex);
                 }
                 if (metadataB != null) {
-                    writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+                    contentTagsB = getContent(fpsB, metadataB);
+                    writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+                    writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                     writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
                 }
                 writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
@@ -295,8 +309,8 @@ public class ExtractComparer extends AbstractProfiler {
                 tokenCounter.clear(FIELD_B);
                 //write content
                 try {
-                    writeContentData(fileId, metadataA, FIELD_A, CONTENTS_TABLE_A);
-                    writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
+                    writeContentData(fileId, contentTagsA, FIELD_A, CONTENTS_TABLE_A);
+                    writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
                 } catch (IOException e) {
                     throw new RuntimeException(e);
                 }
@@ -327,9 +341,11 @@ public class ExtractComparer extends AbstractProfiler {
                     continue;
                 }
                 Metadata metadataB = metadataListB.get(i);
+                ContentTags contentTagsB = getContent(fpsB, metadataB);
                 //the first file should have the same id as the container id
                 String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
-                writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+                writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                 writeEmbeddedFilePathData(i, fileId, null, metadataB);
                 writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
 
@@ -337,7 +353,7 @@ public class ExtractComparer extends AbstractProfiler {
                 tokenCounter.clear(FIELD_B);
                 //write content
                 try {
-                    writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
+                    writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
                 } catch (IOException e) {
                     throw new RuntimeException(e);
                 }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 200bf33..ccb5011 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.eval;
 
-
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.Types;
@@ -35,8 +34,8 @@ import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.eval.io.ExtractReader;
 import org.apache.tika.eval.io.ExtractReaderException;
 import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.eval.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 
 public class ExtractProfiler extends AbstractProfiler {
@@ -154,6 +153,24 @@ public class ExtractProfiler extends AbstractProfiler {
             new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
     );
 
+    public static TableInfo TAGS_TABLE = new TableInfo("tags", //one integer count column per tag of interest, keyed by file id
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.TAGS_A, Types.INTEGER),
+            new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
+            new ColInfo(Cols.TAGS_I, Types.INTEGER),
+            new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
+            new ColInfo(Cols.TAGS_LI, Types.INTEGER),
+            new ColInfo(Cols.TAGS_OL, Types.INTEGER),
+            new ColInfo(Cols.TAGS_P, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TD, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TR, Types.INTEGER),
+            new ColInfo(Cols.TAGS_U, Types.INTEGER),
+            new ColInfo(Cols.TAGS_UL, Types.INTEGER),
+            new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN) //set from ContentTags.getParseException() in AbstractProfiler.writeTagData
+    );
+
     private final Path inputDir;
     private final Path extracts;
     private final ExtractReader extractReader;
@@ -224,13 +241,15 @@ public class ExtractProfiler extends AbstractProfiler {
         List<Integer> numAttachments = countAttachments(metadataList);
         int i = 0;
         for (Metadata m : metadataList) {
+            ContentTags contentTags = getContent(fps, m);
             //the first file should have the same id as the container id
             String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
-            writeProfileData(fps, i, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
+            writeTagData(fileId, contentTags, TAGS_TABLE);
+            writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
             writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
             writeExceptionData(fileId, m, EXCEPTION_TABLE);
             try {
-                writeContentData(fileId, m, FIELD, CONTENTS_TABLE);
+                writeContentData(fileId, contentTags, FIELD, CONTENTS_TABLE);
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index 3cd428a..909032c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -46,6 +46,7 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
         List<TableInfo> tableInfosAandB = new ArrayList<>();
         tableInfosA.add(ExtractComparer.PROFILES_A);
         tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+        tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
         tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
         tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
         tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
@@ -53,6 +54,7 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
         tableInfosB.add(ExtractComparer.PROFILES_B);
         tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
         tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+        tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
         tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
         tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index 11310ee..729460b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -46,6 +46,7 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
         tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
         tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
         tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+        tableInfos.add(ExtractProfiler.TAGS_TABLE);
         tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
         this.tableInfos = Collections.unmodifiableList(tableInfos);
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index e29598d..3fa8cb5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.eval.db;
 
+import java.sql.Types;
+
 public enum Cols {
     //container table
     CONTAINER_ID,
@@ -86,6 +88,24 @@ public enum Cols {
     MIME_STRING,//string representation of mime type
 
     DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
-    DIR_NAME_B
+    DIR_NAME_B,
+    
+    //structure tags
+    TAGS_A,
+    TAGS_DIV,
+    TAGS_I,
+    TAGS_IMG,
+    TAGS_LI,
+    TAGS_P,
+    TAGS_OL,
+    TAGS_TABLE,
+    TAGS_TD,
+    TAGS_TITLE,
+    TAGS_TR,
+    TAGS_UL,
+    TAGS_U,
+    TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
+    ;
+
 }
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
index d406919..b45a688 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.eval.io;
 
 import java.io.BufferedReader;
@@ -10,6 +26,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -23,27 +40,13 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
 
 public class ExtractReader {
     private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
@@ -90,7 +93,7 @@ public class ExtractReader {
         }
 
         FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
-        if (fileSuffixes.txtOrJson == null) {
+        if (fileSuffixes.format == null) {
             throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
         }
         if (! Files.isRegularFile(extractFile)) {
@@ -138,7 +141,7 @@ public class ExtractReader {
         }
 
         try {
-            if (fileSuffixes.txtOrJson.equals("json")) {
+            if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
                 metadataList = JsonMetadataList.fromJson(reader);
                 if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
                     while (metadataList.size() > 1) {
@@ -181,6 +184,11 @@ public class ExtractReader {
         String content = IOUtils.toString(reader);
         Metadata m = new Metadata();
         m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+        if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
+            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
+        } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
+            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
+        }
         //Let's hope the file name has a suffix that can
         //be used to determine the mime.  Could be wrong or missing,
         //but better than nothing.
@@ -200,18 +208,37 @@ public class ExtractReader {
         if (fName == null) {
             return fileSuffixes;
         }
-        Matcher m = Pattern.compile("^(.*?)\\.(json|txt)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
+        Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
         if (m.find()) {
             fileSuffixes.originalFileName = m.group(1);
-            fileSuffixes.txtOrJson = m.group(2);
+            fileSuffixes.setFormat(m.group(2));
             fileSuffixes.compression = m.group(3);
         }
         return fileSuffixes;
     }
 
     private static class FileSuffixes {
+
+        //format of the extract file, as determined from its file name suffix
+        enum FORMAT {
+            TXT,
+            HTML,
+            JSON
+        }
         String compression;
-        String txtOrJson;
+        FORMAT format;
         String originalFileName;
+
+        /**
+         * Sets {@link #format} from the format suffix of the extract file.
+         *
+         * @param fmt suffix without the leading dot; compared case-insensitively.
+         *            "json" and "txt" must match exactly; any value containing
+         *            "html" (e.g. "html" or "xhtml") is treated as HTML
+         * @throws IllegalArgumentException if the suffix is not recognized
+         */
+        public void setFormat(String fmt) {
+            String lc = fmt.toLowerCase(Locale.ENGLISH);
+            if (lc.equals("json")) {
+                format = FORMAT.JSON;
+            } else if (lc.equals("txt")) {
+                format = FORMAT.TXT;
+            } else if (lc.contains("html")) {
+                format = FORMAT.HTML;
+            } else {
+                //.html is accepted as well as .xhtml, so name both and
+                //report the value that was rejected
+                throw new IllegalArgumentException(
+                        "extract must end in .json, .txt, .html or .xhtml: " + fmt);
+            }
+        }
     }
 }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java
new file mode 100644
index 0000000..c971d13
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.util;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Extracts the text content of an (X)HTML string and counts the occurrences
+ * of a caller-supplied set of tags.
+ */
+public class ContentTagParser {
+
+    private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+    /**
+     * Parses the markup as XML with {@link XMLReaderUtils#parseSAX}.
+     *
+     * @param html markup to parse; it is handed to the parser as UTF-8 bytes
+     * @param uppercaseTagsOfInterest names of the tags to count; element names
+     *                                are uppercased before the lookup, so the
+     *                                entries must be uppercase
+     * @return the handler's text output and the counts of the tags of interest
+     * @throws TikaException if the underlying parse call throws it
+     * @throws IOException if the underlying parse call throws it
+     * @throws SAXException if the underlying parse call throws it
+     */
+    public static ContentTags parseXML(String html, Set<String> uppercaseTagsOfInterest)
+            throws TikaException, IOException, SAXException {
+        Map<String, Integer> tags = new HashMap<>();
+        XHTMLContentTagHandler xhtmlContentTagHandler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags);
+        XMLReaderUtils.parseSAX(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)),
+                xhtmlContentTagHandler, EMPTY_PARSE_CONTEXT);
+        return new ContentTags(xhtmlContentTagHandler.toString(), tags);
+    }
+
+    /**
+     * Parses the markup as HTML with the TagSoup SAX parser.
+     *
+     * @param html markup to parse
+     * @param uppercaseTagsOfInterest names of the tags to count; element names
+     *                                are uppercased before the lookup, so the
+     *                                entries must be uppercase
+     * @return the handler's text output and the counts of the tags of interest
+     * @throws SAXException if the underlying parse call throws it
+     * @throws IOException if the underlying parse call throws it
+     */
+    public static ContentTags parseHTML(String html, Set<String> uppercaseTagsOfInterest) throws SAXException, IOException {
+        Map<String, Integer> tags = new HashMap<>();
+        XHTMLContentTagHandler xhtmlContentTagHandler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags);
+        SAXParserImpl.newInstance(null).parse(new InputSource(new StringReader(html)), xhtmlContentTagHandler);
+        return new ContentTags(xhtmlContentTagHandler.toString(), tags);
+    }
+
+
+    /**
+     * Text handler that also counts the start tags of interest
+     * into the map supplied by the caller.
+     */
+    private static class XHTMLContentTagHandler extends ToTextContentHandler {
+        //Used to have a stack to make sure that starting/ending tags were matched
+        //However, this was a non-starter because tag soup fixes non-matching tags for html
+        //and the straight SAXParser throws an exception for mismatched tags in xml
+
+        private final Map<String, Integer> tags;
+        private final Set<String> uppercaseTagsOfInterest;
+
+        public XHTMLContentTagHandler(Set<String> uppercaseTagsOfInterest, Map<String, Integer> tags) {
+            this.uppercaseTagsOfInterest = uppercaseTagsOfInterest;
+            this.tags = tags;
+        }
+
+        @Override
+        public void startElement(
+                String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            super.startElement(uri, localName, qName, atts);
+            //a null qName maps to "" so that the lookup below cannot throw
+            String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH);
+            if (uppercaseTagsOfInterest.contains(uc)) {
+                //first occurrence stores 1, later occurrences add 1
+                tags.merge(uc, 1, Integer::sum);
+            }
+        }
+    }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
new file mode 100644
index 0000000..115976f
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.util;
+
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * Holder for the text content of an extract, the counts of the tags
+ * found in it, and a flag recording a parse exception.
+ */
+public class ContentTags {
+
+    /** Empty content, no tags, no parse exception. */
+    public static final ContentTags EMPTY_CONTENT_TAGS = new ContentTags();
+    final Map<String, Integer> tags;
+    final String content;
+    final boolean parseException;
+
+    private ContentTags() {
+        //emptyMap() instead of the raw EMPTY_MAP constant: same empty map,
+        //but without the unchecked conversion
+        this("", Collections.emptyMap(), false);
+    }
+
+    /**
+     * @param content text content; tags are empty and parseException is false
+     */
+    public ContentTags(String content) {
+        this(content, Collections.emptyMap(), false);
+    }
+
+    /**
+     * @param content text content; tags are empty
+     * @param parseException value to report from {@link #getParseException()}
+     */
+    public ContentTags(String content, boolean parseException) {
+        this(content, Collections.emptyMap(), parseException);
+    }
+
+    /**
+     * @param content text content; parseException is false
+     * @param tags tag counts; the map is stored as is, not copied
+     */
+    public ContentTags(String content, Map<String, Integer> tags) {
+        this(content, tags, false);
+    }
+
+    private ContentTags(String content, Map<String, Integer> tags,
+                        boolean parseException) {
+        this.content = content;
+        this.tags = tags;
+        this.parseException = parseException;
+    }
+
+    /** @return the text content passed to the constructor */
+    public String getContent() {
+        return content;
+    }
+
+    /** @return the tag counts; this is the stored map itself, not a copy */
+    public Map<String, Integer> getTags() {
+        return tags;
+    }
+
+    /** @return the parse exception flag passed to the constructor */
+    public boolean getParseException() {
+        return parseException;
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index de09fa1..ab3dfb2 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -17,7 +17,6 @@
 package org.apache.tika.eval;
 
 import static org.apache.tika.eval.AbstractProfiler.EXCEPTION_TYPE;
-import static org.apache.tika.eval.AbstractProfiler.getContent;
 import static org.apache.tika.eval.io.ExtractReader.IGNORE_LENGTH;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
@@ -38,11 +37,12 @@ import org.apache.tika.eval.db.Cols;
 import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.eval.io.ExtractReader;
 import org.apache.tika.eval.io.ExtractReaderException;
+import org.apache.tika.eval.util.ContentTags;
 import org.apache.tika.eval.util.LanguageIDWrapper;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 
@@ -53,18 +53,24 @@ import org.junit.Test;
 public class SimpleComparerTest extends TikaTest {
 
     private ExtractComparer comparer = null;
-    private MockDBWriter writer = null;
+    private static MockDBWriter WRITER;
+
+    @BeforeClass
+    public static void staticSetUp() throws Exception {
+        WRITER = new MockDBWriter();
+        AbstractProfiler.loadCommonTokens(
+                Paths.get(SimpleComparerTest.class.getResource("/common_tokens").toURI()), "en");
+        LanguageIDWrapper.loadBuiltInModels();
+    }
 
     @Before
     public void setUp() throws Exception {
-        writer = new MockDBWriter();
+        WRITER.clear(); //the writer is static and shared: reset it before each test
         comparer = new ExtractComparer(null, null,
                 Paths.get("extractsA"), Paths.get("extractsB"),
                 new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
                         IGNORE_LENGTH, IGNORE_LENGTH),
-                writer);
-        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath(), "en");
-        LanguageIDWrapper.loadBuiltInModels();
+                WRITER);
     }
 
     @Test
@@ -79,16 +85,14 @@ public class SimpleComparerTest extends TikaTest {
 
         comparer.compareFiles(fpsA, fpsB);
 
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
         Map<Cols, String> row = tableInfos.get(0);
-        assertEquals("0", row.get(Cols.ID));
         assertTrue(
                 row.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A)
                         .startsWith("1,200: 1 | 120000: 1 | over: 1"));
 
-        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
         row = tableInfos.get(0);
-        assertEquals("0", row.get(Cols.ID));
         assertEquals("70", row.get(Cols.CONTENT_LENGTH));
         assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
         assertEquals("14", row.get(Cols.NUM_TOKENS));
@@ -97,9 +101,8 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
         assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
 
-        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_B);
+        tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_B);
         row = tableInfos.get(0);
-        assertEquals("0", row.get(Cols.ID));
         assertEquals("76", row.get(Cols.CONTENT_LENGTH));
         assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
         assertEquals("13", row.get(Cols.NUM_TOKENS));
@@ -107,7 +110,7 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
         assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
 
-        tableInfos = writer.getTable(ExtractComparer.PROFILES_A);
+        tableInfos = WRITER.getTable(ExtractComparer.PROFILES_A);
         row = tableInfos.get(0);
         assertEquals("2", row.get(Cols.NUM_PAGES));
 
@@ -125,7 +128,7 @@ public class SimpleComparerTest extends TikaTest {
 
         comparer.compareFiles(fpsA, fpsB);
 
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
 
         Map<Cols, String> row = tableInfos.get(0);
         assertEquals("133", row.get(Cols.CONTENT_LENGTH));
@@ -154,7 +157,7 @@ public class SimpleComparerTest extends TikaTest {
 
         comparer.compareFiles(fpsA, fpsB);
 
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
 
         Map<Cols, String> row = tableInfos.get(0);
         assertEquals("122", row.get(Cols.TOKEN_LENGTH_SUM));
@@ -174,9 +177,8 @@ public class SimpleComparerTest extends TikaTest {
                 getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()
         );
         comparer.compareFiles(fpsA, fpsB);
-        List<Map<Cols, String>> table = writer.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+        List<Map<Cols, String>> table = WRITER.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
         Map<Cols, String> row = table.get(0);
-        //debugPrintRow(row);
         assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
                 row.get(Cols.EXTRACT_EXCEPTION_ID));
     }
@@ -184,24 +186,23 @@ public class SimpleComparerTest extends TikaTest {
 
     @Test
     public void testGetContent() throws Exception {
-        Metadata m = new Metadata();
-        m.add(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "0123456789");
+        ContentTags contentTags = new ContentTags("0123456789");
         Map<Cols, String> data = new HashMap<>();
-        String content = getContent(m, 10, data);
+        String content = AbstractProfiler.truncateContent(contentTags, 10, data); //max length == content length
         assertEquals(10, content.length());
         assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
-        content = getContent(m, 4, data);
+        content = AbstractProfiler.truncateContent(contentTags, 4, data); //max length < content length
         assertEquals(4, content.length());
         assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
         //test Metadata with no content
-        content = getContent(new Metadata(), 10, data);
+        content = AbstractProfiler.truncateContent(ContentTags.EMPTY_CONTENT_TAGS, 10, data); //empty ContentTags
         assertEquals(0, content.length());
         assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
         //test null Metadata
-        content = getContent(null, 10, data);
+        content = AbstractProfiler.truncateContent(null, 10, data); //null ContentTags
         assertEquals(0, content.length());
         assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
     }
@@ -218,10 +219,9 @@ public class SimpleComparerTest extends TikaTest {
         );
         comparer.compareFiles(fpsA, fpsB);
         for (TableInfo t : new TableInfo[]{ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) {
-            List<Map<Cols, String>> table = writer.getTable(t);
+            List<Map<Cols, String>> table = WRITER.getTable(t);
 
             Map<Cols, String> rowA = table.get(0);
-            //debugPrintRow(rowA);
             assertEquals(Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()),
                     rowA.get(Cols.PARSE_EXCEPTION_ID));
             assertNull(rowA.get(Cols.ORIG_STACK_TRACE));
@@ -229,7 +229,6 @@ public class SimpleComparerTest extends TikaTest {
         }
     }
 
-
     @Test
     public void testAttachmentCounts() {
         List<Metadata> list = new ArrayList<>();
@@ -276,7 +275,7 @@ public class SimpleComparerTest extends TikaTest {
                 getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json").toPath()
         );
         comparer.compareFiles(fpsA, fpsB);
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
         assertEquals(3, tableInfos.size());
         for (int i = 0; i < tableInfos.size(); i++) {
             assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP));
@@ -284,6 +283,69 @@ public class SimpleComparerTest extends TikaTest {
     }
 
     @Test
+    public void testTags() throws Exception {
+        //A is a json extract, B is an html extract
+        Path extractA = getResourceAsFile("/test-dirs/extractsA/file15_tags.json").toPath();
+        Path extractB = getResourceAsFile("/test-dirs/extractsB/file15_tags.html").toPath();
+        comparer.compareFiles(
+                new EvalFilePaths(Paths.get("file15_tags.json"), extractA),
+                new EvalFilePaths(Paths.get("file15_tags.html"), extractB));
+
+        //tag counts for extract A
+        List<Map<Cols, String>> rowsA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+        assertEquals(1, rowsA.size());
+        Map<Cols, String> tagsA = rowsA.get(0);
+        assertEquals("18", tagsA.get(Cols.TAGS_P));
+        assertEquals("1", tagsA.get(Cols.TAGS_DIV));
+        assertEquals("1", tagsA.get(Cols.TAGS_TITLE));
+
+        //tag counts for extract B
+        List<Map<Cols, String>> rowsB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
+        assertEquals(1, rowsB.size());
+        Map<Cols, String> tagsB = rowsB.get(0);
+        assertEquals("18", tagsB.get(Cols.TAGS_DIV));
+        assertEquals("1", tagsB.get(Cols.TAGS_IMG));
+    }
+
+    @Test
+    public void testBadTags() throws Exception {
+        Path extractA = getResourceAsFile("/test-dirs/extractsA/file16_badTags.json").toPath();
+        Path extractB = getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath();
+        comparer.compareFiles(
+                new EvalFilePaths(Paths.get("file16_badtags.json"), extractA),
+                new EvalFilePaths(Paths.get("file16_badtags.html"), extractB));
+
+        //A: the tag problem in the json extract must be reported
+        List<Map<Cols, String>> rowsA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+        assertEquals(1, rowsA.size());
+        assertEquals("true", rowsA.get(0).get(Cols.TAGS_PARSE_EXCEPTION));
+
+        //B: there actually is a tag problem, but tagsoup fixes it.
+        //this confirms behavior.
+        List<Map<Cols, String>> rowsB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
+        assertEquals(1, rowsB.size());
+        assertEquals("false", rowsB.get(0).get(Cols.TAGS_PARSE_EXCEPTION));
+    }
+
+    @Test
+    public void testTagsOutOfOrder() throws Exception {
+        EvalFilePaths fpsA = new EvalFilePaths(
+                Paths.get("file17_tagsOutOfOrder.json"),
+                getResourceAsFile("/test-dirs/extractsA/file17_tagsOutOfOrder.json").toPath()
+        );
+        //the resource is named file16_badTags.html; the lookup has to match
+        //that case exactly or it fails on case-sensitive file systems
+        EvalFilePaths fpsB = new EvalFilePaths(
+                Paths.get("file16_badtags.html"),
+                getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath());
+        comparer.compareFiles(fpsA, fpsB);
+        //only extract A is checked: its tag problem must be reported
+        List<Map<Cols, String>> tableInfosA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+        assertEquals(1, tableInfosA.size());
+        Map<Cols, String> tableInfoA = tableInfosA.get(0);
+        assertEquals("true", tableInfoA.get(Cols.TAGS_PARSE_EXCEPTION));
+    }
+
+    @Test
     @Ignore
     public void testDebug() throws Exception {
         Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
@@ -313,7 +375,7 @@ public class SimpleComparerTest extends TikaTest {
     }
 
     private void debugPrintTable(TableInfo tableInfo) {
-        List<Map<Cols, String>> table = writer.getTable(tableInfo);
+        List<Map<Cols, String>> table = WRITER.getTable(tableInfo);
         if (table == null) {
             return;
         }
@@ -337,10 +399,10 @@ public class SimpleComparerTest extends TikaTest {
     }
 
     @Test
     @Ignore("useful for testing 2 files not in test set")
     public void oneOff() throws Exception {
         Path p1 = Paths.get("");
         Path p2 = Paths.get("");
 
         EvalFilePaths fpsA = new EvalFilePaths(
                 Paths.get("file1.pdf.json"),
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json b/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json
new file mode 100644
index 0000000..5af73db
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json b/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json
new file mode 100644
index 0000000..5c6272e
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003c\u003c\u003c\u003c\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json b/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json
new file mode 100644
index 0000000..97afec8
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html b/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html
new file mode 100644
index 0000000..a08be46
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html
@@ -0,0 +1,31 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<style type="text/css">
+.txt { white-space:nowrap; }
+#f0 { font-family:sans-serif; font-weight:normal; font-style:normal; }
+
+</style>
+</head>
+<body>
+<img id="background" style="position:absolute; left:0px; top:0px;" width="595" height="842" src="page1.png">
+<div class="txt" style="position:absolute; left:18px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika - Apache Tika</span></div>
+<div class="txt" style="position:absolute; left:449px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">http://incubator.apache.org/tika/</span></div>
+<div class="txt" style="position:absolute; left:62px; top:77px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Tika - Content Analysis Toolkit</span></div>
+<div class="txt" style="position:absolute; left:57px; top:118px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is a toolkit for detecting and extracting metadata and structured text content</span></div>
+<div class="txt" style="position:absolute; left:57px; top:131px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">from various documents using existing parser libraries.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:154px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is an effort undergoing </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">incubation </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">at </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">The Apache So [...]
+<div class="txt" style="position:absolute; left:57px; top:167px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">sponsored by the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">Apache Lucene PMC. </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Incubation is required of all newly accepted projects</span></div>
+<div class="txt" style="position:absolute; left:57px; top:180px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">until a further review indicates that the infrastructure, communications, and decision making</span></div>
+<div class="txt" style="position:absolute; left:57px; top:193px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">process have stabilized in a manner consistent with other successful ASF projects. While</span></div>
+<div class="txt" style="position:absolute; left:57px; top:206px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">incubation status is not necessarily a reflection of the completeness or stability of the code, it</span></div>
+<div class="txt" style="position:absolute; left:57px; top:219px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">does indicate that the project has yet to be fully endorsed by the ASF.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:242px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">See the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">Apache Tika Incubation Status </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">page for the current incubation status.</span></div>
+<div class="txt" style="position:absolute; left:62px; top:289px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Latest News</span></div>
+<div class="txt" style="position:absolute; left:62px; top:333px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(152,0,0,1);">March 22nd, 2007: Apache Tika project started</span></div>
+<div class="txt" style="position:absolute; left:92px; top:344px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">The Apache Tika project was formally started when the </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169,1);">Tika proposal </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">was </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169, [...]
+<div class="txt" style="position:absolute; left:92px; top:355px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(60,106,152,1);">Incubator PMC.</span></div>
+<div class="txt" style="position:absolute; left:18px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">1 of 1</span></div>
+<div class="txt" style="position:absolute; left:510px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">15.9.2007 11:02</span></div>
+</body>
+</html>
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html b/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html
new file mode 100644
index 0000000..19ed27c
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html
@@ -0,0 +1,31 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<style type="text/css">
+.txt { white-space:nowrap; }
+#f0 { font-family:sans-serif; font-weight:normal; font-style:normal; }
+
+</style>
+</head>
+<body>
+<img id="background" style="position:absolute; left:0px; top:0px;" width="595" height="842" src="page1.png">
+<div class="txt" style="position:absolute; left:18px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache <i><b>bad tag</i></b>- Apache Tika</span></div>
+<div class="txt" style="position:absolute; left:449px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">http://incubator.apache.org/tika/</span></div>
+<div class="txt" style="position:absolute; left:62px; top:77px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Tika - Content Analysis Toolkit</span></div>
+<div class="txt" style="position:absolute; left:57px; top:118px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is a toolkit for detecting and extracting metadata and structured text content</span></div>
+<div class="txt" style="position:absolute; left:57px; top:131px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">from various documents using existing parser libraries.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:154px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is an effort undergoing </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">incubation </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">at </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">The Apache So [...]
+<div class="txt" style="position:absolute; left:57px; top:167px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">sponsored by the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">Apache Lucene PMC. </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Incubation is required of all newly accepted projects</span></div>
+<div class="txt" style="position:absolute; left:57px; top:180px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">until a further review indicates that the infrastructure, communications, and decision making</span></div>
+<div class="txt" style="position:absolute; left:57px; top:193px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">process have stabilized in a manner consistent with other successful ASF projects. While</span></div>
+<div class="txt" style="position:absolute; left:57px; top:206px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">incubation status is not necessarily a reflection of the completeness or stability of the code, it</span></div>
+<div class="txt" style="position:absolute; left:57px; top:219px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">does indicate that the project has yet to be fully endorsed by the ASF.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:242px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">See the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">Apache Tika Incubation Status </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">page for the current incubation status.</span></div>
+<div class="txt" style="position:absolute; left:62px; top:289px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Latest News</span></div>
+<div class="txt" style="position:absolute; left:62px; top:333px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(152,0,0,1);">March 22nd, 2007: Apache Tika project started</span></div>
+<div class="txt" style="position:absolute; left:92px; top:344px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">The Apache Tika project was formally started when the </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169,1);">Tika proposal </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">was </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169, [...]
+<div class="txt" style="position:absolute; left:92px; top:355px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(60,106,152,1);">Incubator PMC.</span></div>
+<div class="txt" style="position:absolute; left:18px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">1 of 1</span></div>
+<div class="txt" style="position:absolute; left:510px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">15.9.2007 11:02</span></div>
+</body>
+</html>