You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 16:43:06 UTC

[tika] branch master updated (d838bd7 -> 1ac6a3b)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


 discard d838bd7  TIKA-2791 -- add tags/structure to tika-eval
     new 1ac6a3b  TIKA-2791 -- add tags/structure to tika-eval

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user force-pushes a change (git push --force) and generates
a repository containing something like this:

 * -- * -- B -- O -- O -- O   (d838bd7)
            \
             N -- N -- N   refs/heads/master (1ac6a3b)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Any revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../src/test/java/org/apache/tika/eval/SimpleComparerTest.java      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

[tika] 01/01: TIKA-2791 -- add tags/structure to tika-eval

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1ac6a3bd8601dc3376ce01786f115b877b9d338f
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 11:00:37 2018 -0500

    TIKA-2791 -- add tags/structure to tika-eval
---
 .../sax/AbstractRecursiveParserWrapperHandler.java |   4 +
 .../tika/sax/RecursiveParserWrapperHandler.java    |   1 +
 tika-eval/pom.xml                                  |   6 +-
 .../org/apache/tika/eval/AbstractProfiler.java     | 142 +++++++++++++++++----
 .../java/org/apache/tika/eval/ExtractComparer.java |  30 ++++-
 .../java/org/apache/tika/eval/ExtractProfiler.java |  27 +++-
 .../tika/eval/batch/ExtractComparerBuilder.java    |   2 +
 .../tika/eval/batch/ExtractProfilerBuilder.java    |   1 +
 .../main/java/org/apache/tika/eval/db/Cols.java    |  22 +++-
 .../org/apache/tika/eval/io/ExtractReader.java     |  71 +++++++----
 .../apache/tika/eval/util/ContentTagParser.java    |  89 +++++++++++++
 .../org/apache/tika/eval/util/ContentTags.java     |  63 +++++++++
 .../org/apache/tika/eval/SimpleComparerTest.java   | 120 ++++++++++++-----
 .../resources/test-dirs/extractsA/file15_tags.json |  41 ++++++
 .../test-dirs/extractsA/file16_badTags.json        |  41 ++++++
 .../test-dirs/extractsA/file17_tagsOutOfOrder.json |  41 ++++++
 .../resources/test-dirs/extractsB/file15_tags.html |  31 +++++
 .../test-dirs/extractsB/file16_badTags.html        |  31 +++++
 18 files changed, 672 insertions(+), 91 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index d53f18e..55f5c58 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -36,6 +36,10 @@ import java.nio.charset.Charset;
 public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler implements Serializable {
 
     public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
+    /**
+     * Simple class name of the content handler
+     */
+    public final static Property TIKA_CONTENT_HANDLER = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content_handler");
     public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
     public final static Property WRITE_LIMIT_REACHED =
             Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 5faf3a4..408598f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -114,6 +114,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
             String content = handler.toString();
             if (content != null && content.trim().length() > 0 ) {
                 metadata.add(TIKA_CONTENT, content);
+                metadata.add(TIKA_CONTENT_HANDLER, handler.getClass().getSimpleName());
             }
         }
     }
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 262910e..5cf5bed 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,7 +124,11 @@
             <artifactId>poi-scratchpad</artifactId>
             <version>${poi.version}</version>
         </dependency>
-
+        <dependency>
+            <groupId>org.ccil.cowan.tagsoup</groupId>
+            <artifactId>tagsoup</artifactId>
+            <version>1.2.1</version>
+        </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-batch</artifactId>
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 0b41acb..aa999dd 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -56,21 +56,25 @@ import org.apache.tika.eval.tokens.CommonTokenResult;
 import org.apache.tika.eval.tokens.TokenCounter;
 import org.apache.tika.eval.tokens.TokenIntPair;
 import org.apache.tika.eval.tokens.TokenStatistics;
+import org.apache.tika.eval.util.ContentTags;
+import org.apache.tika.eval.util.ContentTagParser;
 import org.apache.tika.eval.util.LanguageIDWrapper;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
 import org.apache.tika.utils.ExceptionUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
 
 public abstract class AbstractProfiler extends FileResourceConsumer {
 
     private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
 
-
     private static final String[] EXTRACT_EXTENSIONS = {
             ".json",
             ".txt",
@@ -103,14 +107,35 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
     public static final String TRUE = Boolean.toString(true);
     public static final String FALSE = Boolean.toString(false);
+    private static final String ZERO = "0";
 
 
     protected static final AtomicInteger ID = new AtomicInteger();
 
-    private final static String UNKNOWN_EXTENSION = "unk";
+    private static final String UNKNOWN_EXTENSION = "unk";
     //make this configurable
-    private final static String DIGEST_KEY = "X-TIKA:digest:MD5";
-
+    private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+
+    private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+
+    private static Map<String, Cols> initTags() {
+        //simplify this mess
+        Map<String, Cols> tmp = new HashMap<>();
+        tmp.put("A", Cols.TAGS_A);
+        tmp.put("DIV", Cols.TAGS_DIV);
+        tmp.put("I", Cols.TAGS_I);
+        tmp.put("IMG", Cols.TAGS_IMG);
+        tmp.put("LI", Cols.TAGS_LI);
+        tmp.put("OL", Cols.TAGS_OL);
+        tmp.put("P", Cols.TAGS_P);
+        tmp.put("TABLE", Cols.TAGS_TABLE);
+        tmp.put("TD", Cols.TAGS_TD);
+        tmp.put("TITLE", Cols.TAGS_TITLE);
+        tmp.put("TR", Cols.TAGS_TR);
+        tmp.put("U", Cols.TAGS_U);
+        tmp.put("UL", Cols.TAGS_UL);
+        return Collections.unmodifiableMap(tmp);
+    }
     private static CommonTokenCountManager commonTokenCountManager;
     private String lastExtractExtension = null;
 
@@ -230,7 +255,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
     }
 
-    protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
+    protected void writeProfileData(EvalFilePaths fps, int i,
+                                    ContentTags contentTags, Metadata m,
                                     String fileId, String containerId,
                                     List<Integer> numAttachments, TableInfo profileTable) {
 
@@ -275,7 +301,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         data.put(Cols.ELAPSED_TIME_MILLIS,
                 getTime(m));
 
-        String content = getContent(m);
+        String content = contentTags.getContent();
         if (content == null || content.trim().length() == 0) {
             data.put(Cols.HAS_CONTENT, FALSE);
         } else {
@@ -331,17 +357,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
      * entered into the content table.
      *
      * @param fileId
-     * @param m
+     * @param contentTags
      * @param fieldName
      * @param contentsTable
      */
-    protected void writeContentData(String fileId, Metadata m,
+    protected void writeContentData(String fileId, ContentTags contentTags,
                                     String fieldName, TableInfo contentsTable) throws IOException {
-        if (m == null) {
+        if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
             return;
         }
         Map<Cols, String> data = new HashMap<>();
-        String content = getContent(m, maxContentLength, data);
+        String content = truncateContent(contentTags, maxContentLength, data);
         if (content == null || content.trim().length() == 0) {
             return;
         }
@@ -350,7 +376,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
         data.put(Cols.ID, fileId);
         data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
-        langid(m, data);
+        langid(contentTags, data);
         String langid = data.get(Cols.LANG_ID_1);
         langid = (langid == null) ? "" : langid;
 
@@ -383,7 +409,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
         data.put(Cols.TOKEN_LENGTH_STD_DEV,
                 Double.toString(summStats.getStandardDeviation()));
-        unicodeBlocks(m, data);
+        unicodeBlocks(contentTags, data);
         try {
             writer.writeRow(contentsTable, data);
         } catch (IOException e) {
@@ -391,6 +417,36 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         }
     }
 
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+        Map<String, Integer> tags = contentTags.getTags();
+        if (tags.size() == 0 && contentTags.getParseException() == false) {
+            return;
+        }
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+
+        for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
+            Integer count = tags.get(e.getKey());
+            if (count == null) {
+                data.put(e.getValue(), ZERO);
+            } else {
+                data.put(e.getValue(), Integer.toString(count));
+            }
+        }
+
+        if (contentTags.getParseException()) {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
+        } else {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
+        }
+        try {
+            writer.writeRow(tagsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+
     String getTime(Metadata m) {
         String elapsed = "-1";
 
@@ -459,14 +515,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     /**
      * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
      *
-     * @param metadata
+     * @param contentTags
      * @param maxLength
      * @param data
      * @return
      */
-    protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
+    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
         data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
-        String c = getContent(metadata);
+        if (contentTags == null) {
+            return "";
+        }
+        String c = contentTags.getContent();
         if (maxLength > -1 && c.length() > maxLength) {
             c = c.substring(0, maxLength);
             data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
@@ -474,19 +533,15 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         return c;
 
     }
-    protected static String getContent(Metadata metadata) {
+    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
         if (metadata == null) {
-            return "";
+            return ContentTags.EMPTY_CONTENT_TAGS;
         }
-        String c = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
-        if (c == null) {
-            return "";
-        }
-        return c;
+        return parseContentAndTags(evalFilePaths, metadata);
     }
 
-    void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
-        String content = getContent(metadata);
+    void unicodeBlocks(ContentTags contentTags, Map<Cols, String> data) {
+        String content = contentTags.getContent();
         if (content.length() < 200) {
             return;
         }
@@ -537,8 +592,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
     }
 
-    void langid(Metadata metadata, Map<Cols, String> data) {
-        String content = getContent(metadata);
+    void langid(ContentTags contentTags, Map<Cols, String> data) {
+        String content = contentTags.getContent();
         if (content.length() < 50) {
             return;
         }
@@ -765,5 +820,38 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
             sb.append(parts[i]);
         }
     }
+
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+        String s = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        if (s == null || s.length() == 0) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+
+        String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER);
+        if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
+            try {
+                return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException|SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+                return new ContentTags(s, true);
+            }
+        } else if (
+                evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
+                (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+            try {
+                return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (TikaException|IOException|SAXException e) {
+                LOG.warn("Problem parsing xhtml in {}; backing off to treat string as text",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+                return new ContentTags(s, true);
+            }
+        }
+        return new ContentTags(s);
+    }
+
+
 }
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 1ff5f0b..86d1672 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -42,9 +42,9 @@ import org.apache.tika.eval.io.IDBWriter;
 import org.apache.tika.eval.tokens.ContrastStatistics;
 import org.apache.tika.eval.tokens.TokenContraster;
 import org.apache.tika.eval.tokens.TokenIntPair;
+import org.apache.tika.eval.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 
 public class ExtractComparer extends AbstractProfiler {
@@ -148,6 +148,13 @@ public class ExtractComparer extends AbstractProfiler {
     public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
             ExtractProfiler.CONTENTS_TABLE.getColInfos());
 
+    public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
+            ExtractProfiler.TAGS_TABLE.getColInfos());
+
+    public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
+            ExtractProfiler.TAGS_TABLE.getColInfos());
+
+
     public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
             ExtractProfiler.EXCEPTION_TABLE.getColInfos());
 
@@ -275,9 +282,14 @@ public class ExtractComparer extends AbstractProfiler {
                 //the first file should have the same id as the container id
                 String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
                 Metadata metadataA = metadataListA.get(i);
+                ContentTags contentTagsA = getContent(fpsA, metadataA);
+                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
                 Metadata metadataB = null;
+
                 //TODO: shouldn't be fileA!!!!
-                writeProfileData(fpsA, i, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
+                writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
+
+                writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
                 writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
                 int matchIndex = getMatch(i, metadataListA, metadataListB);
 
@@ -286,7 +298,9 @@ public class ExtractComparer extends AbstractProfiler {
                     handledB.add(matchIndex);
                 }
                 if (metadataB != null) {
-                    writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+                    contentTagsB = getContent(fpsB, metadataB);
+                    writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+                    writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                     writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
                 }
                 writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
@@ -295,8 +309,8 @@ public class ExtractComparer extends AbstractProfiler {
                 tokenCounter.clear(FIELD_B);
                 //write content
                 try {
-                    writeContentData(fileId, metadataA, FIELD_A, CONTENTS_TABLE_A);
-                    writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
+                    writeContentData(fileId, contentTagsA, FIELD_A, CONTENTS_TABLE_A);
+                    writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
                 } catch (IOException e) {
                     throw new RuntimeException(e);
                 }
@@ -327,9 +341,11 @@ public class ExtractComparer extends AbstractProfiler {
                     continue;
                 }
                 Metadata metadataB = metadataListB.get(i);
+                ContentTags contentTagsB = getContent(fpsB, metadataB);
                 //the first file should have the same id as the container id
                 String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
-                writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+                writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                 writeEmbeddedFilePathData(i, fileId, null, metadataB);
                 writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
 
@@ -337,7 +353,7 @@ public class ExtractComparer extends AbstractProfiler {
                 tokenCounter.clear(FIELD_B);
                 //write content
                 try {
-                    writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
+                    writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
                 } catch (IOException e) {
                     throw new RuntimeException(e);
                 }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 200bf33..ccb5011 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.eval;
 
-
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.Types;
@@ -35,8 +34,8 @@ import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.eval.io.ExtractReader;
 import org.apache.tika.eval.io.ExtractReaderException;
 import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.eval.util.ContentTags;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 
 public class ExtractProfiler extends AbstractProfiler {
@@ -154,6 +153,24 @@ public class ExtractProfiler extends AbstractProfiler {
             new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
     );
 
+    public static TableInfo TAGS_TABLE = new TableInfo("tags",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.TAGS_A, Types.INTEGER),
+            new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
+            new ColInfo(Cols.TAGS_I, Types.INTEGER),
+            new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
+            new ColInfo(Cols.TAGS_LI, Types.INTEGER),
+            new ColInfo(Cols.TAGS_OL, Types.INTEGER),
+            new ColInfo(Cols.TAGS_P, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TD, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TR, Types.INTEGER),
+            new ColInfo(Cols.TAGS_U, Types.INTEGER),
+            new ColInfo(Cols.TAGS_UL, Types.INTEGER),
+            new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)
+    );
+
     private final Path inputDir;
     private final Path extracts;
     private final ExtractReader extractReader;
@@ -224,13 +241,15 @@ public class ExtractProfiler extends AbstractProfiler {
         List<Integer> numAttachments = countAttachments(metadataList);
         int i = 0;
         for (Metadata m : metadataList) {
+            ContentTags contentTags = getContent(fps, m);
             //the first file should have the same id as the container id
             String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
-            writeProfileData(fps, i, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
+            writeTagData(fileId, contentTags, TAGS_TABLE);
+            writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
             writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
             writeExceptionData(fileId, m, EXCEPTION_TABLE);
             try {
-                writeContentData(fileId, m, FIELD, CONTENTS_TABLE);
+                writeContentData(fileId, contentTags, FIELD, CONTENTS_TABLE);
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index 3cd428a..909032c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -46,6 +46,7 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
         List<TableInfo> tableInfosAandB = new ArrayList<>();
         tableInfosA.add(ExtractComparer.PROFILES_A);
         tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+        tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
         tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
         tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
         tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
@@ -53,6 +54,7 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
         tableInfosB.add(ExtractComparer.PROFILES_B);
         tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
         tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+        tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
         tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
         tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index 11310ee..729460b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -46,6 +46,7 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
         tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
         tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
         tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+        tableInfos.add(ExtractProfiler.TAGS_TABLE);
         tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
         this.tableInfos = Collections.unmodifiableList(tableInfos);
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index e29598d..3fa8cb5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.eval.db;
 
+import java.sql.Types;
+
 public enum Cols {
     //container table
     CONTAINER_ID,
@@ -86,6 +88,24 @@ public enum Cols {
     MIME_STRING,//string representation of mime type
 
     DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
-    DIR_NAME_B
+    DIR_NAME_B,
+    
+    //structure tags
+    TAGS_A,
+    TAGS_DIV,
+    TAGS_I,
+    TAGS_IMG,
+    TAGS_LI,
+    TAGS_P,
+    TAGS_OL,
+    TAGS_TABLE,
+    TAGS_TD,
+    TAGS_TITLE,
+    TAGS_TR,
+    TAGS_UL,
+    TAGS_U,
+    TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
+    ;
+
 }
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
index d406919..b45a688 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.eval.io;
 
 import java.io.BufferedReader;
@@ -10,6 +26,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -23,27 +40,13 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
 
 public class ExtractReader {
     private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
@@ -90,7 +93,7 @@ public class ExtractReader {
         }
 
         FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
-        if (fileSuffixes.txtOrJson == null) {
+        if (fileSuffixes.format == null) {
             throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
         }
         if (! Files.isRegularFile(extractFile)) {
@@ -138,7 +141,7 @@ public class ExtractReader {
         }
 
         try {
-            if (fileSuffixes.txtOrJson.equals("json")) {
+            if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
                 metadataList = JsonMetadataList.fromJson(reader);
                 if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
                     while (metadataList.size() > 1) {
@@ -181,6 +184,11 @@ public class ExtractReader {
         String content = IOUtils.toString(reader);
         Metadata m = new Metadata();
         m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+        if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
+            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
+        } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
+            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
+        }
         //Let's hope the file name has a suffix that can
         //be used to determine the mime.  Could be wrong or missing,
         //but better than nothing.
@@ -200,18 +208,37 @@ public class ExtractReader {
         if (fName == null) {
             return fileSuffixes;
         }
-        Matcher m = Pattern.compile("^(.*?)\\.(json|txt)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
+        Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
         if (m.find()) {
             fileSuffixes.originalFileName = m.group(1);
-            fileSuffixes.txtOrJson = m.group(2);
+            fileSuffixes.setFormat(m.group(2));
             fileSuffixes.compression = m.group(3);
         }
         return fileSuffixes;
     }
 
     private static class FileSuffixes {
+
+        enum FORMAT {
+            TXT,
+            HTML,
+            JSON
+        }
         String compression;
-        String txtOrJson;
+        FORMAT format;
         String originalFileName;
+        /** Sets format from a file suffix, ignoring case; any suffix containing "html" maps to HTML. */
+        public void setFormat(String fmt) {
+            String lc = fmt.toLowerCase(Locale.ENGLISH);
+            if (lc.equals("json")) {
+                format = FORMAT.JSON;
+            } else if (lc.equals("txt")) {
+                format = FORMAT.TXT;
+            } else if (lc.contains("html")) {
+                format = FORMAT.HTML;
+            } else {
+                throw new IllegalArgumentException("extract must end in .json, .txt, .html or .xhtml");
+            }
+        }
     }
 }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java
new file mode 100644
index 0000000..c971d13
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.util;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+public class ContentTagParser {
+    // Static helpers: each returns a ContentTags built from the handler output and the tag counts.
+    private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+    /**
+     * Feeds {@code html}, encoded as UTF-8, to XMLReaderUtils.parseSAX and returns the
+     * handler's toString() output together with counts of the requested tags.
+     */
+    public static ContentTags parseXML(String html, Set<String> uppercaseTagsOfInterest)
+            throws TikaException, IOException, SAXException {
+        Map<String, Integer> counts = new HashMap<>();
+        XHTMLContentTagHandler handler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, counts);
+        ByteArrayInputStream utf8 = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
+        XMLReaderUtils.parseSAX(utf8, handler, EMPTY_PARSE_CONTEXT);
+        return new ContentTags(handler.toString(), counts);
+    }
+
+    /** Same result shape as parseXML, but reads {@code html} with the tagsoup SAXParserImpl. */
+    public static ContentTags parseHTML(String html, Set<String> uppercaseTagsOfInterest) throws SAXException, IOException {
+        Map<String, Integer> counts = new HashMap<>();
+        XHTMLContentTagHandler handler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, counts);
+        InputSource source = new InputSource(new StringReader(html));
+        SAXParserImpl.newInstance(null).parse(source, handler);
+        return new ContentTags(handler.toString(), counts);
+    }
+
+    /** Text handler that also counts start tags whose upper-cased qName is in the set of interest. */
+    private static class XHTMLContentTagHandler extends ToTextContentHandler {
+        //Used to have a stack to make sure that starting/ending tags were matched
+        //However, this was a non-starter because tag soup fixes non-matching tags for html
+        //and the straight SAXParser throws an exception for mismatched tags in xml
+
+        private final Map<String, Integer> tags;
+        private final Set<String> uppercaseTagsOfInterest;
+
+        public XHTMLContentTagHandler(Set<String> uppercaseTagsOfInterest, Map<String, Integer> tags) {
+            this.tags = tags;
+            this.uppercaseTagsOfInterest = uppercaseTagsOfInterest;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            super.startElement(uri, localName, qName, atts);
+            String upper = (qName != null) ? qName.toUpperCase(Locale.ENGLISH) : "";
+            if (!uppercaseTagsOfInterest.contains(upper)) {
+                return;
+            }
+            Integer seen = tags.get(upper);
+            tags.put(upper, seen == null ? 1 : seen + 1);
+        }
+    }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
new file mode 100644
index 0000000..115976f
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.util;
+
+import java.util.Collections;
+import java.util.Map;
+
+public class ContentTags {
+    // Holds extracted content, a map of tag name to count, and a parse-exception flag.
+    public static final ContentTags EMPTY_CONTENT_TAGS = new ContentTags();
+    final Map<String, Integer> tags;
+    final String content;
+    final boolean parseException;
+    // typed emptyMap() instead of the raw EMPTY_MAP constant avoids an unchecked conversion
+    private ContentTags() {
+        this("", Collections.<String, Integer>emptyMap(), false);
+    }
+    /** Content only: empty tag map, parseException false. */
+    public ContentTags(String content) {
+        this(content, Collections.emptyMap(), false);
+    }
+    /** Content with an empty tag map and the given parseException flag. */
+    public ContentTags(String content, boolean parseException) {
+        this(content, Collections.emptyMap(), parseException);
+    }
+    /** Content with the given tag counts; parseException false. The map is stored as passed, not copied. */
+    public ContentTags(String content, Map<String, Integer> tags) {
+        this(content, tags, false);
+    }
+    /** Canonical constructor that every other constructor delegates to. */
+    private ContentTags(String content, Map<String, Integer> tags,
+                        boolean parseException) {
+        this.content = content;
+        this.tags = tags;
+        this.parseException = parseException;
+    }
+    /** @return the content string given at construction ("" for EMPTY_CONTENT_TAGS) */
+    public String getContent() {
+        return content;
+    }
+    /** @return the tag-count map given at construction, or an empty map */
+    public Map<String, Integer> getTags() {
+        return tags;
+    }
+    /** @return the parseException flag given at construction */
+    public boolean getParseException() {
+        return parseException;
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index de09fa1..78a8ca5 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -17,7 +17,6 @@
 package org.apache.tika.eval;
 
 import static org.apache.tika.eval.AbstractProfiler.EXCEPTION_TYPE;
-import static org.apache.tika.eval.AbstractProfiler.getContent;
 import static org.apache.tika.eval.io.ExtractReader.IGNORE_LENGTH;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
@@ -38,11 +37,12 @@ import org.apache.tika.eval.db.Cols;
 import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.eval.io.ExtractReader;
 import org.apache.tika.eval.io.ExtractReaderException;
+import org.apache.tika.eval.util.ContentTags;
 import org.apache.tika.eval.util.LanguageIDWrapper;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 
@@ -53,18 +53,24 @@ import org.junit.Test;
 public class SimpleComparerTest extends TikaTest {
 
     private ExtractComparer comparer = null;
-    private MockDBWriter writer = null;
+    private static MockDBWriter WRITER;
+    /** One-time setup: creates the shared WRITER, loads the "en" common tokens, calls loadBuiltInModels(). */
+    @BeforeClass
+    public static void staticSetUp() throws Exception {
+        WRITER = new MockDBWriter();
+        Path commonTokens = Paths.get(SimpleComparerTest.class.getResource("/common_tokens").toURI());
+        AbstractProfiler.loadCommonTokens(commonTokens, "en");
+        LanguageIDWrapper.loadBuiltInModels();
+    }
 
     @Before
     public void setUp() throws Exception {
-        writer = new MockDBWriter();
+        WRITER.clear();
         comparer = new ExtractComparer(null, null,
                 Paths.get("extractsA"), Paths.get("extractsB"),
                 new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
                         IGNORE_LENGTH, IGNORE_LENGTH),
-                writer);
-        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath(), "en");
-        LanguageIDWrapper.loadBuiltInModels();
+                WRITER);
     }
 
     @Test
@@ -79,16 +85,14 @@ public class SimpleComparerTest extends TikaTest {
 
         comparer.compareFiles(fpsA, fpsB);
 
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
         Map<Cols, String> row = tableInfos.get(0);
-        assertEquals("0", row.get(Cols.ID));
         assertTrue(
                 row.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A)
                         .startsWith("1,200: 1 | 120000: 1 | over: 1"));
 
-        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
         row = tableInfos.get(0);
-        assertEquals("0", row.get(Cols.ID));
         assertEquals("70", row.get(Cols.CONTENT_LENGTH));
         assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
         assertEquals("14", row.get(Cols.NUM_TOKENS));
@@ -97,9 +101,8 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
         assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
 
-        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_B);
+        tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_B);
         row = tableInfos.get(0);
-        assertEquals("0", row.get(Cols.ID));
         assertEquals("76", row.get(Cols.CONTENT_LENGTH));
         assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
         assertEquals("13", row.get(Cols.NUM_TOKENS));
@@ -107,7 +110,7 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
         assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
 
-        tableInfos = writer.getTable(ExtractComparer.PROFILES_A);
+        tableInfos = WRITER.getTable(ExtractComparer.PROFILES_A);
         row = tableInfos.get(0);
         assertEquals("2", row.get(Cols.NUM_PAGES));
 
@@ -125,7 +128,7 @@ public class SimpleComparerTest extends TikaTest {
 
         comparer.compareFiles(fpsA, fpsB);
 
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
 
         Map<Cols, String> row = tableInfos.get(0);
         assertEquals("133", row.get(Cols.CONTENT_LENGTH));
@@ -154,7 +157,7 @@ public class SimpleComparerTest extends TikaTest {
 
         comparer.compareFiles(fpsA, fpsB);
 
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
 
         Map<Cols, String> row = tableInfos.get(0);
         assertEquals("122", row.get(Cols.TOKEN_LENGTH_SUM));
@@ -174,9 +177,8 @@ public class SimpleComparerTest extends TikaTest {
                 getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()
         );
         comparer.compareFiles(fpsA, fpsB);
-        List<Map<Cols, String>> table = writer.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+        List<Map<Cols, String>> table = WRITER.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
         Map<Cols, String> row = table.get(0);
-        //debugPrintRow(row);
         assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
                 row.get(Cols.EXTRACT_EXCEPTION_ID));
     }
@@ -184,24 +186,23 @@ public class SimpleComparerTest extends TikaTest {
 
     @Test
     public void testGetContent() throws Exception {
-        Metadata m = new Metadata();
-        m.add(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "0123456789");
+        ContentTags contentTags = new ContentTags("0123456789");
         Map<Cols, String> data = new HashMap<>();
-        String content = getContent(m, 10, data);
+        String content = AbstractProfiler.truncateContent(contentTags, 10, data);
         assertEquals(10, content.length());
         assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
-        content = getContent(m, 4, data);
+        content = AbstractProfiler.truncateContent(contentTags, 4, data);
         assertEquals(4, content.length());
         assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
         //test Metadata with no content
-        content = getContent(new Metadata(), 10, data);
+        content = AbstractProfiler.truncateContent(ContentTags.EMPTY_CONTENT_TAGS, 10, data);
         assertEquals(0, content.length());
         assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
         //test null Metadata
-        content = getContent(null, 10, data);
+        content = AbstractProfiler.truncateContent(null, 10, data);
         assertEquals(0, content.length());
         assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
     }
@@ -218,10 +219,9 @@ public class SimpleComparerTest extends TikaTest {
         );
         comparer.compareFiles(fpsA, fpsB);
         for (TableInfo t : new TableInfo[]{ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) {
-            List<Map<Cols, String>> table = writer.getTable(t);
+            List<Map<Cols, String>> table = WRITER.getTable(t);
 
             Map<Cols, String> rowA = table.get(0);
-            //debugPrintRow(rowA);
             assertEquals(Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()),
                     rowA.get(Cols.PARSE_EXCEPTION_ID));
             assertNull(rowA.get(Cols.ORIG_STACK_TRACE));
@@ -229,7 +229,6 @@ public class SimpleComparerTest extends TikaTest {
         }
     }
 
-
     @Test
     public void testAttachmentCounts() {
         List<Metadata> list = new ArrayList<>();
@@ -276,7 +275,7 @@ public class SimpleComparerTest extends TikaTest {
                 getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json").toPath()
         );
         comparer.compareFiles(fpsA, fpsB);
-        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+        List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
         assertEquals(3, tableInfos.size());
         for (int i = 0; i < tableInfos.size(); i++) {
             assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP));
@@ -284,6 +283,69 @@ public class SimpleComparerTest extends TikaTest {
     }
 
     @Test
+    public void testTags() throws Exception {
+        //A is a json extract carrying XHTML content; B is a raw .html extract
+        Path jsonExtract = getResourceAsFile("/test-dirs/extractsA/file15_tags.json").toPath();
+        Path htmlExtract = getResourceAsFile("/test-dirs/extractsB/file15_tags.html").toPath();
+        EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file15_tags.json"), jsonExtract);
+        EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file15_tags.html"), htmlExtract);
+        comparer.compareFiles(fpsA, fpsB);
+        //tag counts written for extract A
+        List<Map<Cols, String>> rowsA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+        assertEquals(1, rowsA.size());
+        Map<Cols, String> rowA = rowsA.get(0);
+        assertEquals("18", rowA.get(Cols.TAGS_P));
+        assertEquals("1", rowA.get(Cols.TAGS_DIV));
+        assertEquals("1", rowA.get(Cols.TAGS_TITLE));
+
+        //tag counts written for extract B
+        List<Map<Cols, String>> rowsB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
+        assertEquals(1, rowsB.size());
+        Map<Cols, String> rowB = rowsB.get(0);
+        assertEquals("18", rowB.get(Cols.TAGS_DIV));
+        assertEquals("1", rowB.get(Cols.TAGS_IMG));
+    }
+
+    @Test
+    public void testBadTags() throws Exception {
+        //A: json extract with garbled XHTML content; B: html extract with mis-nested <i><b> tags
+        Path jsonExtract = getResourceAsFile("/test-dirs/extractsA/file16_badTags.json").toPath();
+        Path htmlExtract = getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath();
+        EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file16_badtags.json"), jsonExtract);
+        EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badtags.html"), htmlExtract);
+        comparer.compareFiles(fpsA, fpsB);
+
+        //extract A is expected to record a tag parse exception
+        List<Map<Cols, String>> rowsA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+        assertEquals(1, rowsA.size());
+        Map<Cols, String> rowA = rowsA.get(0);
+        assertEquals("true", rowA.get(Cols.TAGS_PARSE_EXCEPTION));
+
+        List<Map<Cols, String>> rowsB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
+        assertEquals(1, rowsB.size());
+        Map<Cols, String> rowB = rowsB.get(0);
+        //there actually is a tag problem, but tagsoup fixes it.
+        //this confirms behavior.
+        assertEquals("false", rowB.get(Cols.TAGS_PARSE_EXCEPTION));
+    }
+
+    @Test
+    public void testTagsOutOfOrder() throws Exception {
+        //extract A should be recorded with a tag parse exception
+        EvalFilePaths fpsA = new EvalFilePaths(
+                Paths.get("file17_tagsOutOfOrder.json"),
+                getResourceAsFile("/test-dirs/extractsA/file17_tagsOutOfOrder.json").toPath());
+        //resource names are case-sensitive on some file systems; the file is file16_badTags.html
+        EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badtags.html"),
+                getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath());
+        comparer.compareFiles(fpsA, fpsB);
+        List<Map<Cols, String>> tableInfosA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+        assertEquals(1, tableInfosA.size());
+        Map<Cols, String> tableInfoA = tableInfosA.get(0);
+        assertEquals("true", tableInfoA.get(Cols.TAGS_PARSE_EXCEPTION));
+    }
+
+    @Test
     @Ignore
     public void testDebug() throws Exception {
         Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
@@ -313,7 +375,7 @@ public class SimpleComparerTest extends TikaTest {
     }
 
     private void debugPrintTable(TableInfo tableInfo) {
-        List<Map<Cols, String>> table = writer.getTable(tableInfo);
+        List<Map<Cols, String>> table = WRITER.getTable(tableInfo);
         if (table == null) {
             return;
         }
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json b/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json
new file mode 100644
index 0000000..5af73db
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json b/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json
new file mode 100644
index 0000000..5c6272e
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003c\u003c\u003c\u003c\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json b/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json
new file mode 100644
index 0000000..97afec8
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json
@@ -0,0 +1,41 @@
+[
+  {
+    "Content-Length": "34824",
+    "Content-Type": "application/pdf",
+    "Last-Modified": "2007-09-15T09:02:31Z",
+    "X-Parsed-By": [
+      "org.apache.tika.parser.DefaultParser",
+      "org.apache.tika.parser.pdf.PDFParser"
+    ],
+    "X-TIKA:content_handler": "ToXMLContentHandler",
+    "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...]
+    "X-TIKA:parse_time_millis": "500",
+    "access_permission:assemble_document": "true",
+    "access_permission:can_modify": "true",
+    "access_permission:can_print": "true",
+    "access_permission:can_print_degraded": "true",
+    "access_permission:extract_content": "true",
+    "access_permission:extract_for_accessibility": "true",
+    "access_permission:fill_in_form": "true",
+    "access_permission:modify_annotations": "true",
+    "dc:creator": "Bertrand DelacrΘtaz",
+    "dc:format": "application/pdf; version\u003d1.3",
+    "dc:title": "Apache Tika - Apache Tika",
+    "dcterms:created": "2007-09-15T09:02:31Z",
+    "dcterms:modified": "2007-09-15T09:02:31Z",
+    "meta:author": "Bertrand DelacrΘtaz",
+    "meta:creation-date": "2007-09-15T09:02:31Z",
+    "meta:save-date": "2007-09-15T09:02:31Z",
+    "pdf:PDFVersion": "1.3",
+    "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+    "pdf:docinfo:creator_tool": "Firefox",
+    "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+    "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+    "pdf:docinfo:title": "Apache Tika - Apache Tika",
+    "pdf:encrypted": "false",
+    "resourceName": "testPDF.pdf",
+    "xmp:CreatorTool": "Firefox",
+    "xmpTPg:NPages": "1"
+  }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html b/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html
new file mode 100644
index 0000000..a08be46
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html
@@ -0,0 +1,31 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<style type="text/css">
+.txt { white-space:nowrap; }
+#f0 { font-family:sans-serif; font-weight:normal; font-style:normal; }
+
+</style>
+</head>
+<body>
+<img id="background" style="position:absolute; left:0px; top:0px;" width="595" height="842" src="page1.png">
+<div class="txt" style="position:absolute; left:18px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika - Apache Tika</span></div>
+<div class="txt" style="position:absolute; left:449px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">http://incubator.apache.org/tika/</span></div>
+<div class="txt" style="position:absolute; left:62px; top:77px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Tika - Content Analysis Toolkit</span></div>
+<div class="txt" style="position:absolute; left:57px; top:118px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is a toolkit for detecting and extracting metadata and structured text content</span></div>
+<div class="txt" style="position:absolute; left:57px; top:131px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">from various documents using existing parser libraries.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:154px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is an effort undergoing </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">incubation </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">at </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">The Apache So [...]
+<div class="txt" style="position:absolute; left:57px; top:167px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">sponsored by the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">Apache Lucene PMC. </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Incubation is required of all newly accepted projects</span></div>
+<div class="txt" style="position:absolute; left:57px; top:180px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">until a further review indicates that the infrastructure, communications, and decision making</span></div>
+<div class="txt" style="position:absolute; left:57px; top:193px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">process have stabilized in a manner consistent with other successful ASF projects. While</span></div>
+<div class="txt" style="position:absolute; left:57px; top:206px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">incubation status is not necessarily a reflection of the completeness or stability of the code, it</span></div>
+<div class="txt" style="position:absolute; left:57px; top:219px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">does indicate that the project has yet to be fully endorsed by the ASF.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:242px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">See the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">Apache Tika Incubation Status </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">page for the current incubation status.</span></div>
+<div class="txt" style="position:absolute; left:62px; top:289px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Latest News</span></div>
+<div class="txt" style="position:absolute; left:62px; top:333px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(152,0,0,1);">March 22nd, 2007: Apache Tika project started</span></div>
+<div class="txt" style="position:absolute; left:92px; top:344px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">The Apache Tika project was formally started when the </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169,1);">Tika proposal </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">was </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169, [...]
+<div class="txt" style="position:absolute; left:92px; top:355px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(60,106,152,1);">Incubator PMC.</span></div>
+<div class="txt" style="position:absolute; left:18px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">1 of 1</span></div>
+<div class="txt" style="position:absolute; left:510px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">15.9.2007 11:02</span></div>
+</body>
+</html>
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html b/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html
new file mode 100644
index 0000000..19ed27c
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html
@@ -0,0 +1,31 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<style type="text/css">
+.txt { white-space:nowrap; }
+#f0 { font-family:sans-serif; font-weight:normal; font-style:normal; }
+
+</style>
+</head>
+<body>
+<img id="background" style="position:absolute; left:0px; top:0px;" width="595" height="842" src="page1.png">
+<div class="txt" style="position:absolute; left:18px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache <i><b>bad tag</i></b>- Apache Tika</span></div>
+<div class="txt" style="position:absolute; left:449px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">http://incubator.apache.org/tika/</span></div>
+<div class="txt" style="position:absolute; left:62px; top:77px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Tika - Content Analysis Toolkit</span></div>
+<div class="txt" style="position:absolute; left:57px; top:118px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is a toolkit for detecting and extracting metadata and structured text content</span></div>
+<div class="txt" style="position:absolute; left:57px; top:131px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">from various documents using existing parser libraries.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:154px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is an effort undergoing </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">incubation </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">at </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">The Apache So [...]
+<div class="txt" style="position:absolute; left:57px; top:167px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">sponsored by the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">Apache Lucene PMC. </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Incubation is required of all newly accepted projects</span></div>
+<div class="txt" style="position:absolute; left:57px; top:180px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">until a further review indicates that the infrastructure, communications, and decision making</span></div>
+<div class="txt" style="position:absolute; left:57px; top:193px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">process have stabilized in a manner consistent with other successful ASF projects. While</span></div>
+<div class="txt" style="position:absolute; left:57px; top:206px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">incubation status is not necessarily a reflection of the completeness or stability of the code, it</span></div>
+<div class="txt" style="position:absolute; left:57px; top:219px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">does indicate that the project has yet to be fully endorsed by the ASF.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:242px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">See the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">Apache Tika Incubation Status </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">page for the current incubation status.</span></div>
+<div class="txt" style="position:absolute; left:62px; top:289px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Latest News</span></div>
+<div class="txt" style="position:absolute; left:62px; top:333px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(152,0,0,1);">March 22nd, 2007: Apache Tika project started</span></div>
+<div class="txt" style="position:absolute; left:92px; top:344px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">The Apache Tika project was formally started when the </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169,1);">Tika proposal </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">was </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169, [...]
+<div class="txt" style="position:absolute; left:92px; top:355px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(60,106,152,1);">Incubator PMC.</span></div>
+<div class="txt" style="position:absolute; left:18px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">1 of 1</span></div>
+<div class="txt" style="position:absolute; left:510px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">15.9.2007 11:02</span></div>
+</body>
+</html>