You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 16:37:37 UTC
[tika] 02/02: TIKA-2791 -- add tags/structure to tika-eval
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d838bd7ee0cbb7921a945f0469ebfd2627714a97
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 11:00:37 2018 -0500
TIKA-2791 -- add tags/structure to tika-eval
---
.../sax/AbstractRecursiveParserWrapperHandler.java | 4 +
.../tika/sax/RecursiveParserWrapperHandler.java | 1 +
tika-eval/pom.xml | 6 +-
.../org/apache/tika/eval/AbstractProfiler.java | 142 +++++++++++++++++----
.../java/org/apache/tika/eval/ExtractComparer.java | 30 ++++-
.../java/org/apache/tika/eval/ExtractProfiler.java | 27 +++-
.../tika/eval/batch/ExtractComparerBuilder.java | 2 +
.../tika/eval/batch/ExtractProfilerBuilder.java | 1 +
.../main/java/org/apache/tika/eval/db/Cols.java | 22 +++-
.../org/apache/tika/eval/io/ExtractReader.java | 71 +++++++----
.../apache/tika/eval/util/ContentTagParser.java | 89 +++++++++++++
.../org/apache/tika/eval/util/ContentTags.java | 63 +++++++++
.../org/apache/tika/eval/SimpleComparerTest.java | 126 +++++++++++++-----
.../resources/test-dirs/extractsA/file15_tags.json | 41 ++++++
.../test-dirs/extractsA/file16_badTags.json | 41 ++++++
.../test-dirs/extractsA/file17_tagsOutOfOrder.json | 41 ++++++
.../resources/test-dirs/extractsB/file15_tags.html | 31 +++++
.../test-dirs/extractsB/file16_badTags.html | 31 +++++
18 files changed, 675 insertions(+), 94 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index d53f18e..55f5c58 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -36,6 +36,10 @@ import java.nio.charset.Charset;
public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler implements Serializable {
public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
+ /**
+ * Simple class name of the content handler
+ */
+ public final static Property TIKA_CONTENT_HANDLER = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content_handler");
public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
public final static Property WRITE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 5faf3a4..408598f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -114,6 +114,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
String content = handler.toString();
if (content != null && content.trim().length() > 0 ) {
metadata.add(TIKA_CONTENT, content);
+ metadata.add(TIKA_CONTENT_HANDLER, handler.getClass().getSimpleName());
}
}
}
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 262910e..5cf5bed 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,7 +124,11 @@
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>
-
+ <dependency>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-batch</artifactId>
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 0b41acb..aa999dd 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -56,21 +56,25 @@ import org.apache.tika.eval.tokens.CommonTokenResult;
import org.apache.tika.eval.tokens.TokenCounter;
import org.apache.tika.eval.tokens.TokenIntPair;
import org.apache.tika.eval.tokens.TokenStatistics;
+import org.apache.tika.eval.util.ContentTags;
+import org.apache.tika.eval.util.ContentTagParser;
import org.apache.tika.eval.util.LanguageIDWrapper;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
public abstract class AbstractProfiler extends FileResourceConsumer {
private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
-
private static final String[] EXTRACT_EXTENSIONS = {
".json",
".txt",
@@ -103,14 +107,35 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
public static final String TRUE = Boolean.toString(true);
public static final String FALSE = Boolean.toString(false);
+ private static final String ZERO = "0";
protected static final AtomicInteger ID = new AtomicInteger();
- private final static String UNKNOWN_EXTENSION = "unk";
+ private static final String UNKNOWN_EXTENSION = "unk";
//make this configurable
- private final static String DIGEST_KEY = "X-TIKA:digest:MD5";
-
+ private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+
+ private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+
+    /**
+     * Builds the lookup from upper-case tag name to the {@link Cols} column
+     * that stores the count for that tag.
+     *
+     * @return an unmodifiable map keyed by upper-case tag name
+     */
+    private static Map<String, Cols> initTags() {
+        //the count for tag X is stored in the column named TAGS_X
+        String[] tagNames = {
+                "A", "DIV", "I", "IMG", "LI", "OL", "P",
+                "TABLE", "TD", "TITLE", "TR", "U", "UL"
+        };
+        Map<String, Cols> tagToColumn = new HashMap<>();
+        for (String tagName : tagNames) {
+            tagToColumn.put(tagName, Cols.valueOf("TAGS_" + tagName));
+        }
+        return Collections.unmodifiableMap(tagToColumn);
+    }
private static CommonTokenCountManager commonTokenCountManager;
private String lastExtractExtension = null;
@@ -230,7 +255,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
- protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
+ protected void writeProfileData(EvalFilePaths fps, int i,
+ ContentTags contentTags, Metadata m,
String fileId, String containerId,
List<Integer> numAttachments, TableInfo profileTable) {
@@ -275,7 +301,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.ELAPSED_TIME_MILLIS,
getTime(m));
- String content = getContent(m);
+ String content = contentTags.getContent();
if (content == null || content.trim().length() == 0) {
data.put(Cols.HAS_CONTENT, FALSE);
} else {
@@ -331,17 +357,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
* entered into the content table.
*
* @param fileId
- * @param m
+ * @param contentTags
* @param fieldName
* @param contentsTable
*/
- protected void writeContentData(String fileId, Metadata m,
+ protected void writeContentData(String fileId, ContentTags contentTags,
String fieldName, TableInfo contentsTable) throws IOException {
- if (m == null) {
+ if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
return;
}
Map<Cols, String> data = new HashMap<>();
- String content = getContent(m, maxContentLength, data);
+ String content = truncateContent(contentTags, maxContentLength, data);
if (content == null || content.trim().length() == 0) {
return;
}
@@ -350,7 +376,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.ID, fileId);
data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
- langid(m, data);
+ langid(contentTags, data);
String langid = data.get(Cols.LANG_ID_1);
langid = (langid == null) ? "" : langid;
@@ -383,7 +409,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.TOKEN_LENGTH_STD_DEV,
Double.toString(summStats.getStandardDeviation()));
- unicodeBlocks(m, data);
+ unicodeBlocks(contentTags, data);
try {
writer.writeRow(contentsTable, data);
} catch (IOException e) {
@@ -391,6 +417,36 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
}
+    /**
+     * Writes one row of tag counts for the given file to the tags table.
+     * Nothing is written when the tag map is empty and no parse exception
+     * was recorded.
+     *
+     * @param fileId id to store in the {@link Cols#ID} column
+     * @param contentTags source of the tag counts and the parse-exception flag
+     * @param tagsTable table the row is written to
+     * @throws RuntimeException wrapping any IOException thrown by the writer
+     */
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+        Map<String, Integer> tagCounts = contentTags.getTags();
+        boolean hadParseException = contentTags.getParseException();
+        if (tagCounts.isEmpty() && !hadParseException) {
+            return;
+        }
+        Map<Cols, String> row = new HashMap<>();
+        row.put(Cols.ID, fileId);
+
+        //every tag of interest gets a value; a tag that was never counted is recorded as zero
+        for (Map.Entry<String, Cols> tagAndColumn : UC_TAGS_OF_INTEREST.entrySet()) {
+            Integer count = tagCounts.get(tagAndColumn.getKey());
+            row.put(tagAndColumn.getValue(), (count == null) ? ZERO : Integer.toString(count));
+        }
+
+        row.put(Cols.TAGS_PARSE_EXCEPTION, hadParseException ? TRUE : FALSE);
+        try {
+            writer.writeRow(tagsTable, row);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+
String getTime(Metadata m) {
String elapsed = "-1";
@@ -459,14 +515,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
/**
* Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
*
- * @param metadata
+ * @param contentTags
* @param maxLength
* @param data
* @return
*/
- protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
+ protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
- String c = getContent(metadata);
+ if (contentTags == null) {
+ return "";
+ }
+ String c = contentTags.getContent();
if (maxLength > -1 && c.length() > maxLength) {
c = c.substring(0, maxLength);
data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
@@ -474,19 +533,15 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
return c;
}
- protected static String getContent(Metadata metadata) {
+ protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
if (metadata == null) {
- return "";
+ return ContentTags.EMPTY_CONTENT_TAGS;
}
- String c = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
- if (c == null) {
- return "";
- }
- return c;
+ return parseContentAndTags(evalFilePaths, metadata);
}
- void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata);
+ void unicodeBlocks(ContentTags contentTags, Map<Cols, String> data) {
+ String content = contentTags.getContent();
if (content.length() < 200) {
return;
}
@@ -537,8 +592,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
}
- void langid(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata);
+ void langid(ContentTags contentTags, Map<Cols, String> data) {
+ String content = contentTags.getContent();
if (content.length() < 50) {
return;
}
@@ -765,5 +820,38 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
sb.append(parts[i]);
}
}
+
+    /**
+     * Pulls the content string out of the metadata and, where the extract looks like
+     * markup, parses it to count the tags of interest.
+     * <ul>
+     *   <li>extract file name ends in ".html": parsed with {@link ContentTagParser#parseHTML}</li>
+     *   <li>extract file name ends in ".xhtml", or the recorded content handler is
+     *       {@link ToXMLContentHandler}: parsed with {@link ContentTagParser#parseXML}</li>
+     *   <li>otherwise: the string is used as-is with no tags</li>
+     * </ul>
+     * If parsing throws, the problem is logged and the raw string is returned with the
+     * parse-exception flag set.
+     *
+     * @param evalFilePaths supplies the extract file whose name selects the parser
+     * @param metadata metadata holding the content and, optionally, the content handler name
+     * @return the content and tag counts, or {@link ContentTags#EMPTY_CONTENT_TAGS}
+     *         if the metadata has no content
+     */
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+        String rawContent = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        if (rawContent == null || rawContent.length() == 0) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+
+        String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER);
+        String lcFileName = evalFilePaths.getExtractFile().getFileName()
+                .toString().toLowerCase(Locale.ENGLISH);
+
+        if (lcFileName.endsWith(".html")) {
+            try {
+                return ContentTagParser.parseHTML(rawContent, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException|SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+                return new ContentTags(rawContent, true);
+            }
+        }
+
+        boolean treatAsXml = lcFileName.endsWith(".xhtml")
+                || ToXMLContentHandler.class.getSimpleName().equals(handlerClass);
+        if (!treatAsXml) {
+            return new ContentTags(rawContent);
+        }
+
+        try {
+            return ContentTagParser.parseXML(rawContent, UC_TAGS_OF_INTEREST.keySet());
+        } catch (TikaException|IOException|SAXException e) {
+            LOG.warn("Problem parsing xhtml in {}; backing off to treat string as text",
+                    evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+            return new ContentTags(rawContent, true);
+        }
+    }
+
+
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 1ff5f0b..86d1672 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -42,9 +42,9 @@ import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.eval.tokens.ContrastStatistics;
import org.apache.tika.eval.tokens.TokenContraster;
import org.apache.tika.eval.tokens.TokenIntPair;
+import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
public class ExtractComparer extends AbstractProfiler {
@@ -148,6 +148,13 @@ public class ExtractComparer extends AbstractProfiler {
public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
ExtractProfiler.CONTENTS_TABLE.getColInfos());
+ public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
+ ExtractProfiler.TAGS_TABLE.getColInfos());
+
+ public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
+ ExtractProfiler.TAGS_TABLE.getColInfos());
+
+
public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
ExtractProfiler.EXCEPTION_TABLE.getColInfos());
@@ -275,9 +282,14 @@ public class ExtractComparer extends AbstractProfiler {
//the first file should have the same id as the container id
String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
Metadata metadataA = metadataListA.get(i);
+ ContentTags contentTagsA = getContent(fpsA, metadataA);
+ ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
Metadata metadataB = null;
+
//TODO: shouldn't be fileA!!!!
- writeProfileData(fpsA, i, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
+ writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
+
+ writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
int matchIndex = getMatch(i, metadataListA, metadataListB);
@@ -286,7 +298,9 @@ public class ExtractComparer extends AbstractProfiler {
handledB.add(matchIndex);
}
if (metadataB != null) {
- writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+ contentTagsB = getContent(fpsB, metadataB);
+ writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+ writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
}
writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
@@ -295,8 +309,8 @@ public class ExtractComparer extends AbstractProfiler {
tokenCounter.clear(FIELD_B);
//write content
try {
- writeContentData(fileId, metadataA, FIELD_A, CONTENTS_TABLE_A);
- writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
+ writeContentData(fileId, contentTagsA, FIELD_A, CONTENTS_TABLE_A);
+ writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -327,9 +341,11 @@ public class ExtractComparer extends AbstractProfiler {
continue;
}
Metadata metadataB = metadataListB.get(i);
+ ContentTags contentTagsB = getContent(fpsB, metadataB);
//the first file should have the same id as the container id
String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
- writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+ writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+ writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
writeEmbeddedFilePathData(i, fileId, null, metadataB);
writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
@@ -337,7 +353,7 @@ public class ExtractComparer extends AbstractProfiler {
tokenCounter.clear(FIELD_B);
//write content
try {
- writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
+ writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
} catch (IOException e) {
throw new RuntimeException(e);
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 200bf33..ccb5011 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.eval;
-
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Types;
@@ -35,8 +34,8 @@ import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
public class ExtractProfiler extends AbstractProfiler {
@@ -154,6 +153,24 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
);
+ public static TableInfo TAGS_TABLE = new TableInfo("tags",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.TAGS_A, Types.INTEGER),
+ new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
+ new ColInfo(Cols.TAGS_I, Types.INTEGER),
+ new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
+ new ColInfo(Cols.TAGS_LI, Types.INTEGER),
+ new ColInfo(Cols.TAGS_OL, Types.INTEGER),
+ new ColInfo(Cols.TAGS_P, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TD, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TR, Types.INTEGER),
+ new ColInfo(Cols.TAGS_U, Types.INTEGER),
+ new ColInfo(Cols.TAGS_UL, Types.INTEGER),
+ new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)
+ );
+
private final Path inputDir;
private final Path extracts;
private final ExtractReader extractReader;
@@ -224,13 +241,15 @@ public class ExtractProfiler extends AbstractProfiler {
List<Integer> numAttachments = countAttachments(metadataList);
int i = 0;
for (Metadata m : metadataList) {
+ ContentTags contentTags = getContent(fps, m);
//the first file should have the same id as the container id
String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
- writeProfileData(fps, i, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
+ writeTagData(fileId, contentTags, TAGS_TABLE);
+ writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
writeExceptionData(fileId, m, EXCEPTION_TABLE);
try {
- writeContentData(fileId, m, FIELD, CONTENTS_TABLE);
+ writeContentData(fileId, contentTags, FIELD, CONTENTS_TABLE);
} catch (IOException e) {
throw new RuntimeException(e);
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index 3cd428a..909032c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -46,6 +46,7 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
List<TableInfo> tableInfosAandB = new ArrayList<>();
tableInfosA.add(ExtractComparer.PROFILES_A);
tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+ tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
@@ -53,6 +54,7 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
tableInfosB.add(ExtractComparer.PROFILES_B);
tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+ tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index 11310ee..729460b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -46,6 +46,7 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+ tableInfos.add(ExtractProfiler.TAGS_TABLE);
tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
this.tableInfos = Collections.unmodifiableList(tableInfos);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index e29598d..3fa8cb5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.eval.db;
+import java.sql.Types;
+
public enum Cols {
//container table
CONTAINER_ID,
@@ -86,6 +88,24 @@ public enum Cols {
MIME_STRING,//string representation of mime type
DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
- DIR_NAME_B
+ DIR_NAME_B,
+
+ //structure tags
+ TAGS_A,
+ TAGS_DIV,
+ TAGS_I,
+ TAGS_IMG,
+ TAGS_LI,
+ TAGS_P,
+ TAGS_OL,
+ TAGS_TABLE,
+ TAGS_TD,
+ TAGS_TITLE,
+ TAGS_TR,
+ TAGS_UL,
+ TAGS_U,
+ TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
+ ;
+
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
index d406919..b45a688 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.eval.io;
import java.io.BufferedReader;
@@ -10,6 +26,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -23,27 +40,13 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
public class ExtractReader {
private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
@@ -90,7 +93,7 @@ public class ExtractReader {
}
FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
- if (fileSuffixes.txtOrJson == null) {
+ if (fileSuffixes.format == null) {
throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
}
if (! Files.isRegularFile(extractFile)) {
@@ -138,7 +141,7 @@ public class ExtractReader {
}
try {
- if (fileSuffixes.txtOrJson.equals("json")) {
+ if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
metadataList = JsonMetadataList.fromJson(reader);
if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
while (metadataList.size() > 1) {
@@ -181,6 +184,11 @@ public class ExtractReader {
String content = IOUtils.toString(reader);
Metadata m = new Metadata();
m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+ if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
+ m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
+ } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
+ m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
+ }
//Let's hope the file name has a suffix that can
//be used to determine the mime. Could be wrong or missing,
//but better than nothing.
@@ -200,18 +208,37 @@ public class ExtractReader {
if (fName == null) {
return fileSuffixes;
}
- Matcher m = Pattern.compile("^(.*?)\\.(json|txt)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
+ Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
if (m.find()) {
fileSuffixes.originalFileName = m.group(1);
- fileSuffixes.txtOrJson = m.group(2);
+ fileSuffixes.setFormat(m.group(2));
fileSuffixes.compression = m.group(3);
}
return fileSuffixes;
}
private static class FileSuffixes {
+
+ enum FORMAT {
+ TXT,
+ HTML,
+ JSON
+ }
String compression;
- String txtOrJson;
+ FORMAT format;
String originalFileName;
+
+        /**
+         * Sets {@link #format} from the suffix captured from the extract file name.
+         * Matching ignores case; any suffix containing "html" (for example
+         * "html" or "xhtml") maps to {@link FORMAT#HTML}.
+         *
+         * @param fmt file suffix without the leading dot
+         * @throws IllegalArgumentException if the suffix is not json, txt or an html variant
+         */
+        public void setFormat(String fmt) {
+            String lc = fmt.toLowerCase(Locale.ENGLISH);
+            if (lc.equals("json")) {
+                format = FORMAT.JSON;
+            } else if (lc.equals("txt")) {
+                format = FORMAT.TXT;
+            } else if (lc.contains("html")) {
+                format = FORMAT.HTML;
+            } else {
+                //list every accepted suffix; .html is accepted as well as .xhtml
+                throw new IllegalArgumentException("extract must end in .json, .txt, .html or .xhtml");
+            }
+        }
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java
new file mode 100644
index 0000000..c971d13
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTagParser.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.util;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Static helpers that parse a string of markup, count the tags of interest
+ * and return them together with the handler's text output as {@link ContentTags}.
+ */
+public class ContentTagParser {
+
+    private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+    /**
+     * Parses the string as xml via {@link XMLReaderUtils#parseSAX}.
+     *
+     * @param html markup to parse; it is encoded as UTF-8 bytes before parsing
+     * @param uppercaseTagsOfInterest upper-case names of the tags to count
+     * @return the handler's string output and the counts of the tags of interest that were seen
+     * @throws TikaException if thrown by the underlying parse
+     * @throws IOException if thrown by the underlying parse
+     * @throws SAXException if thrown by the underlying parse
+     */
+    public static ContentTags parseXML(String html, Set<String> uppercaseTagsOfInterest)
+            throws TikaException, IOException, SAXException {
+        Map<String, Integer> tagCounts = new HashMap<>();
+        XHTMLContentTagHandler handler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, tagCounts);
+        byte[] utf8 = html.getBytes(StandardCharsets.UTF_8);
+        XMLReaderUtils.parseSAX(new ByteArrayInputStream(utf8), handler, EMPTY_PARSE_CONTEXT);
+        return new ContentTags(handler.toString(), tagCounts);
+    }
+
+    /**
+     * Parses the string as html with the TagSoup SAX parser.
+     *
+     * @param html markup to parse
+     * @param uppercaseTagsOfInterest upper-case names of the tags to count
+     * @return the handler's string output and the counts of the tags of interest that were seen
+     * @throws SAXException if thrown by the underlying parse
+     * @throws IOException if thrown by the underlying parse
+     */
+    public static ContentTags parseHTML(String html, Set<String> uppercaseTagsOfInterest) throws SAXException, IOException {
+        Map<String, Integer> tagCounts = new HashMap<>();
+        XHTMLContentTagHandler handler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, tagCounts);
+        InputSource source = new InputSource(new StringReader(html));
+        SAXParserImpl.newInstance(null).parse(source, handler);
+        return new ContentTags(handler.toString(), tagCounts);
+    }
+
+
+    /**
+     * Text handler that additionally counts start tags whose upper-cased
+     * qName is in the set of tags of interest.
+     */
+    private static class XHTMLContentTagHandler extends ToTextContentHandler {
+        //An earlier approach kept a stack to check that start and end tags matched.
+        //That was dropped: tag soup repairs mismatched tags in html,
+        //and the plain SAXParser throws on mismatched tags in xml.
+
+        private final Map<String, Integer> tags;
+        private final Set<String> uppercaseTagsOfInterest;
+
+        public XHTMLContentTagHandler(Set<String> uppercaseTagsOfInterest, Map<String, Integer> tags) {
+            this.uppercaseTagsOfInterest = uppercaseTagsOfInterest;
+            this.tags = tags;
+        }
+
+        @Override
+        public void startElement(
+                String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            super.startElement(uri, localName, qName, atts);
+            //a null qName is looked up as the empty string
+            String tagName = "";
+            if (qName != null) {
+                tagName = qName.toUpperCase(Locale.ENGLISH);
+            }
+            if (!uppercaseTagsOfInterest.contains(tagName)) {
+                return;
+            }
+            //first sighting stores 1; later sightings add 1 to the stored count
+            tags.merge(tagName, 1, Integer::sum);
+        }
+    }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
new file mode 100644
index 0000000..115976f
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.util;
+
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * Value holder pairing a content string with per-tag counts and a flag that records
+ * whether a parse exception was encountered.
+ * <p>
+ * The map handed to a constructor is stored by reference, not copied.
+ */
+public class ContentTags {
+
+    /** Instance with empty content, no tag counts and no parse exception. */
+    public static final ContentTags EMPTY_CONTENT_TAGS = new ContentTags();
+    final Map<String, Integer> tags;
+    final String content;
+    final boolean parseException;
+
+    private ContentTags() {
+        //type-safe emptyMap() rather than the raw Collections.EMPTY_MAP constant,
+        //in line with the other constructors and free of unchecked conversions
+        this("", Collections.emptyMap(), false);
+    }
+
+    /**
+     * Content without tag counts and without a parse exception.
+     *
+     * @param content the content string
+     */
+    public ContentTags(String content) {
+        this(content, Collections.emptyMap(), false);
+    }
+
+    /**
+     * Content without tag counts.
+     *
+     * @param content the content string
+     * @param parseException value later reported by {@link #getParseException()}
+     */
+    public ContentTags(String content, boolean parseException) {
+        this(content, Collections.emptyMap(), parseException);
+    }
+
+    /**
+     * Content with tag counts and no parse exception.
+     *
+     * @param content the content string
+     * @param tags counts per tag name, as supplied by the caller
+     */
+    public ContentTags(String content, Map<String, Integer> tags) {
+        this(content, tags, false);
+    }
+
+    private ContentTags(String content, Map<String, Integer> tags,
+                        boolean parseException) {
+        this.content = content;
+        this.tags = tags;
+        this.parseException = parseException;
+    }
+
+    /**
+     * @return the content string given at construction
+     */
+    public String getContent() {
+        return content;
+    }
+
+    /**
+     * @return the tag-count map given at construction, or an empty map if none was given
+     */
+    public Map<String, Integer> getTags() {
+        return tags;
+    }
+
+    /**
+     * @return the parse-exception flag given at construction; {@code false} if none was given
+     */
+    public boolean getParseException() {
+        return parseException;
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index de09fa1..ab3dfb2 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -17,7 +17,6 @@
package org.apache.tika.eval;
import static org.apache.tika.eval.AbstractProfiler.EXCEPTION_TYPE;
-import static org.apache.tika.eval.AbstractProfiler.getContent;
import static org.apache.tika.eval.io.ExtractReader.IGNORE_LENGTH;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
@@ -38,11 +37,12 @@ import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.ExtractReaderException;
+import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.eval.util.LanguageIDWrapper;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.junit.Before;
+import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
@@ -53,18 +53,24 @@ import org.junit.Test;
public class SimpleComparerTest extends TikaTest {
private ExtractComparer comparer = null;
- private MockDBWriter writer = null;
+ private static MockDBWriter WRITER;
+
+ @BeforeClass
+ public static void staticSetUp() throws Exception {
+ WRITER = new MockDBWriter();
+ AbstractProfiler.loadCommonTokens(
+ Paths.get(SimpleComparerTest.class.getResource("/common_tokens").toURI()), "en");
+ LanguageIDWrapper.loadBuiltInModels();
+ }
@Before
public void setUp() throws Exception {
- writer = new MockDBWriter();
+ WRITER.clear();
comparer = new ExtractComparer(null, null,
Paths.get("extractsA"), Paths.get("extractsB"),
new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
IGNORE_LENGTH, IGNORE_LENGTH),
- writer);
- AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath(), "en");
- LanguageIDWrapper.loadBuiltInModels();
+ WRITER);
}
@Test
@@ -79,16 +85,14 @@ public class SimpleComparerTest extends TikaTest {
comparer.compareFiles(fpsA, fpsB);
- List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+ List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
Map<Cols, String> row = tableInfos.get(0);
- assertEquals("0", row.get(Cols.ID));
assertTrue(
row.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A)
.startsWith("1,200: 1 | 120000: 1 | over: 1"));
- tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+ tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
row = tableInfos.get(0);
- assertEquals("0", row.get(Cols.ID));
assertEquals("70", row.get(Cols.CONTENT_LENGTH));
assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
assertEquals("14", row.get(Cols.NUM_TOKENS));
@@ -97,9 +101,8 @@ public class SimpleComparerTest extends TikaTest {
assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
- tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_B);
+ tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_B);
row = tableInfos.get(0);
- assertEquals("0", row.get(Cols.ID));
assertEquals("76", row.get(Cols.CONTENT_LENGTH));
assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
assertEquals("13", row.get(Cols.NUM_TOKENS));
@@ -107,7 +110,7 @@ public class SimpleComparerTest extends TikaTest {
assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
- tableInfos = writer.getTable(ExtractComparer.PROFILES_A);
+ tableInfos = WRITER.getTable(ExtractComparer.PROFILES_A);
row = tableInfos.get(0);
assertEquals("2", row.get(Cols.NUM_PAGES));
@@ -125,7 +128,7 @@ public class SimpleComparerTest extends TikaTest {
comparer.compareFiles(fpsA, fpsB);
- List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+ List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
Map<Cols, String> row = tableInfos.get(0);
assertEquals("133", row.get(Cols.CONTENT_LENGTH));
@@ -154,7 +157,7 @@ public class SimpleComparerTest extends TikaTest {
comparer.compareFiles(fpsA, fpsB);
- List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+ List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
Map<Cols, String> row = tableInfos.get(0);
assertEquals("122", row.get(Cols.TOKEN_LENGTH_SUM));
@@ -174,9 +177,8 @@ public class SimpleComparerTest extends TikaTest {
getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()
);
comparer.compareFiles(fpsA, fpsB);
- List<Map<Cols, String>> table = writer.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+ List<Map<Cols, String>> table = WRITER.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
Map<Cols, String> row = table.get(0);
- //debugPrintRow(row);
assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
row.get(Cols.EXTRACT_EXCEPTION_ID));
}
@@ -184,24 +186,23 @@ public class SimpleComparerTest extends TikaTest {
@Test
public void testGetContent() throws Exception {
- Metadata m = new Metadata();
- m.add(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "0123456789");
+ ContentTags contentTags = new ContentTags("0123456789");
Map<Cols, String> data = new HashMap<>();
- String content = getContent(m, 10, data);
+ String content = AbstractProfiler.truncateContent(contentTags, 10, data);
assertEquals(10, content.length());
assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
- content = getContent(m, 4, data);
+ content = AbstractProfiler.truncateContent(contentTags, 4, data);
assertEquals(4, content.length());
assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
//test Metadata with no content
- content = getContent(new Metadata(), 10, data);
+ content = AbstractProfiler.truncateContent(ContentTags.EMPTY_CONTENT_TAGS, 10, data);
assertEquals(0, content.length());
assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
//test null Metadata
- content = getContent(null, 10, data);
+ content = AbstractProfiler.truncateContent(null, 10, data);
assertEquals(0, content.length());
assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
}
@@ -218,10 +219,9 @@ public class SimpleComparerTest extends TikaTest {
);
comparer.compareFiles(fpsA, fpsB);
for (TableInfo t : new TableInfo[]{ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) {
- List<Map<Cols, String>> table = writer.getTable(t);
+ List<Map<Cols, String>> table = WRITER.getTable(t);
Map<Cols, String> rowA = table.get(0);
- //debugPrintRow(rowA);
assertEquals(Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()),
rowA.get(Cols.PARSE_EXCEPTION_ID));
assertNull(rowA.get(Cols.ORIG_STACK_TRACE));
@@ -229,7 +229,6 @@ public class SimpleComparerTest extends TikaTest {
}
}
-
@Test
public void testAttachmentCounts() {
List<Metadata> list = new ArrayList<>();
@@ -276,7 +275,7 @@ public class SimpleComparerTest extends TikaTest {
getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json").toPath()
);
comparer.compareFiles(fpsA, fpsB);
- List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
+ List<Map<Cols, String>> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS);
assertEquals(3, tableInfos.size());
for (int i = 0; i < tableInfos.size(); i++) {
assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP));
@@ -284,6 +283,69 @@ public class SimpleComparerTest extends TikaTest {
}
@Test
+ //Compares the file15_tags json extract (A) with the file15_tags html extract (B)
+ //and checks the tag counts written to the A and B tag tables.
+ public void testTags() throws Exception {
+     Path extractA = getResourceAsFile("/test-dirs/extractsA/file15_tags.json").toPath();
+     EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file15_tags.json"), extractA);
+     Path extractB = getResourceAsFile("/test-dirs/extractsB/file15_tags.html").toPath();
+     EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file15_tags.html"), extractB);
+     comparer.compareFiles(fpsA, fpsB);
+
+     //side A: exactly one row with the expected p/div/title counts
+     List<Map<Cols, String>> rowsA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+     assertEquals(1, rowsA.size());
+     Map<Cols, String> rowA = rowsA.get(0);
+     assertEquals("18", rowA.get(Cols.TAGS_P));
+     assertEquals("1", rowA.get(Cols.TAGS_DIV));
+     assertEquals("1", rowA.get(Cols.TAGS_TITLE));
+
+     //side B: exactly one row with the expected div/img counts
+     List<Map<Cols, String>> rowsB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
+     assertEquals(1, rowsB.size());
+     Map<Cols, String> rowB = rowsB.get(0);
+     assertEquals("18", rowB.get(Cols.TAGS_DIV));
+     assertEquals("1", rowB.get(Cols.TAGS_IMG));
+ }
+
+ //Compares the two file16_badTags extracts: the json side (A) is expected to record
+ //a tag parse exception, the html side (B) is not.
+ @Test
+ public void testBadTags() throws Exception {
+     Path extractA = getResourceAsFile("/test-dirs/extractsA/file16_badTags.json").toPath();
+     EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file16_badtags.json"), extractA);
+     Path extractB = getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath();
+     EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badtags.html"), extractB);
+     comparer.compareFiles(fpsA, fpsB);
+
+     List<Map<Cols, String>> rowsA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+     assertEquals(1, rowsA.size());
+     assertEquals("true", rowsA.get(0).get(Cols.TAGS_PARSE_EXCEPTION));
+
+     List<Map<Cols, String>> rowsB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
+     assertEquals(1, rowsB.size());
+     //the html really does contain a tag problem, but tagsoup repairs it;
+     //this assertion pins that behavior.
+     assertEquals("false", rowsB.get(0).get(Cols.TAGS_PARSE_EXCEPTION));
+ }
+
+ //Compares the file17_tagsOutOfOrder json extract (A) with the file16_badTags html
+ //extract (B) and checks that a tag parse exception is recorded for side A.
+ @Test
+ public void testTagsOutOfOrder() throws Exception {
+     EvalFilePaths fpsA = new EvalFilePaths(
+             Paths.get("file17_tagsOutOfOrder.json"),
+             getResourceAsFile("/test-dirs/extractsA/file17_tagsOutOfOrder.json").toPath()
+     );
+     //the fixture is named file16_badTags.html (capital T); the lower-case spelling
+     //only resolves on case-insensitive file systems
+     EvalFilePaths fpsB = new EvalFilePaths(
+             Paths.get("file16_badtags.html"),
+             getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath());
+     comparer.compareFiles(fpsA, fpsB);
+     List<Map<Cols, String>> tableInfosA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A);
+     assertEquals(1, tableInfosA.size());
+     Map<Cols, String> tableInfoA = tableInfosA.get(0);
+     assertEquals("true", tableInfoA.get(Cols.TAGS_PARSE_EXCEPTION));
+ }
+
+ @Test
@Ignore
public void testDebug() throws Exception {
Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
@@ -313,7 +375,7 @@ public class SimpleComparerTest extends TikaTest {
}
private void debugPrintTable(TableInfo tableInfo) {
- List<Map<Cols, String>> table = writer.getTable(tableInfo);
+ List<Map<Cols, String>> table = WRITER.getTable(tableInfo);
if (table == null) {
return;
}
@@ -337,10 +399,10 @@ public class SimpleComparerTest extends TikaTest {
}
@Test
- @Ignore("useful for testing 2 files not in test set")
+ //@Ignore("useful for testing 2 files not in test set")
public void oneOff() throws Exception {
- Path p1 = Paths.get("");
- Path p2 = Paths.get("");
+ Path p1 = Paths.get("C:\\Users\\tallison\\Downloads\\asfasdf\\AQRJRPYMH3PNNK2HLOOKKR4B3QOVWOUH_1_19_1.rar.json");
+ Path p2 = Paths.get("C:\\Users\\tallison\\Downloads\\asfasdf\\AQRJRPYMH3PNNK2HLOOKKR4B3QOVWOUH_1_20.rar.json");
EvalFilePaths fpsA = new EvalFilePaths(
Paths.get("file1.pdf.json"),
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json b/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json
new file mode 100644
index 0000000..5af73db
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file15_tags.json
@@ -0,0 +1,41 @@
+[
+ {
+ "Content-Length": "34824",
+ "Content-Type": "application/pdf",
+ "Last-Modified": "2007-09-15T09:02:31Z",
+ "X-Parsed-By": [
+ "org.apache.tika.parser.DefaultParser",
+ "org.apache.tika.parser.pdf.PDFParser"
+ ],
+ "X-TIKA:content_handler": "ToXMLContentHandler",
+ "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...]
+ "X-TIKA:parse_time_millis": "500",
+ "access_permission:assemble_document": "true",
+ "access_permission:can_modify": "true",
+ "access_permission:can_print": "true",
+ "access_permission:can_print_degraded": "true",
+ "access_permission:extract_content": "true",
+ "access_permission:extract_for_accessibility": "true",
+ "access_permission:fill_in_form": "true",
+ "access_permission:modify_annotations": "true",
+ "dc:creator": "Bertrand DelacrΘtaz",
+ "dc:format": "application/pdf; version\u003d1.3",
+ "dc:title": "Apache Tika - Apache Tika",
+ "dcterms:created": "2007-09-15T09:02:31Z",
+ "dcterms:modified": "2007-09-15T09:02:31Z",
+ "meta:author": "Bertrand DelacrΘtaz",
+ "meta:creation-date": "2007-09-15T09:02:31Z",
+ "meta:save-date": "2007-09-15T09:02:31Z",
+ "pdf:PDFVersion": "1.3",
+ "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+ "pdf:docinfo:creator_tool": "Firefox",
+ "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+ "pdf:docinfo:title": "Apache Tika - Apache Tika",
+ "pdf:encrypted": "false",
+ "resourceName": "testPDF.pdf",
+ "xmp:CreatorTool": "Firefox",
+ "xmpTPg:NPages": "1"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json b/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json
new file mode 100644
index 0000000..5c6272e
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file16_badTags.json
@@ -0,0 +1,41 @@
+[
+ {
+ "Content-Length": "34824",
+ "Content-Type": "application/pdf",
+ "Last-Modified": "2007-09-15T09:02:31Z",
+ "X-Parsed-By": [
+ "org.apache.tika.parser.DefaultParser",
+ "org.apache.tika.parser.pdf.PDFParser"
+ ],
+ "X-TIKA:content_handler": "ToXMLContentHandler",
+ "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" meta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003c\u003c\u003c\u003c\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_pr [...]
+ "X-TIKA:parse_time_millis": "500",
+ "access_permission:assemble_document": "true",
+ "access_permission:can_modify": "true",
+ "access_permission:can_print": "true",
+ "access_permission:can_print_degraded": "true",
+ "access_permission:extract_content": "true",
+ "access_permission:extract_for_accessibility": "true",
+ "access_permission:fill_in_form": "true",
+ "access_permission:modify_annotations": "true",
+ "dc:creator": "Bertrand DelacrΘtaz",
+ "dc:format": "application/pdf; version\u003d1.3",
+ "dc:title": "Apache Tika - Apache Tika",
+ "dcterms:created": "2007-09-15T09:02:31Z",
+ "dcterms:modified": "2007-09-15T09:02:31Z",
+ "meta:author": "Bertrand DelacrΘtaz",
+ "meta:creation-date": "2007-09-15T09:02:31Z",
+ "meta:save-date": "2007-09-15T09:02:31Z",
+ "pdf:PDFVersion": "1.3",
+ "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+ "pdf:docinfo:creator_tool": "Firefox",
+ "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+ "pdf:docinfo:title": "Apache Tika - Apache Tika",
+ "pdf:encrypted": "false",
+ "resourceName": "testPDF.pdf",
+ "xmp:CreatorTool": "Firefox",
+ "xmpTPg:NPages": "1"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json b/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json
new file mode 100644
index 0000000..97afec8
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file17_tagsOutOfOrder.json
@@ -0,0 +1,41 @@
+[
+ {
+ "Content-Length": "34824",
+ "Content-Type": "application/pdf",
+ "Last-Modified": "2007-09-15T09:02:31Z",
+ "X-Parsed-By": [
+ "org.apache.tika.parser.DefaultParser",
+ "org.apache.tika.parser.pdf.PDFParser"
+ ],
+ "X-TIKA:content_handler": "ToXMLContentHandler",
+ "X-TIKA:content": "\u003chtml xmlns\u003d\"http://www.w3.org/1999/xhtml\"\u003e\n\u003chead\u003e\n\u003cmeta name\u003d\"pdf:PDFVersion\" content\u003d\"1.3\" /\u003e\n\u003cmeta name\u003d\"pdf:docinfo:title\" content\u003d\"Apache Tika - Apache Tika\" /\u003e\n\u003cmeta name\u003d\"xmp:CreatorTool\" content\u003d\"Firefox\" /\u003e\n\u003cmeta name\u003d\"access_permission:modify_annotations\" content\u003d\"true\" /\u003e\n\u003cmeta name\u003d\"access_permission:can_print_degra [...]
+ "X-TIKA:parse_time_millis": "500",
+ "access_permission:assemble_document": "true",
+ "access_permission:can_modify": "true",
+ "access_permission:can_print": "true",
+ "access_permission:can_print_degraded": "true",
+ "access_permission:extract_content": "true",
+ "access_permission:extract_for_accessibility": "true",
+ "access_permission:fill_in_form": "true",
+ "access_permission:modify_annotations": "true",
+ "dc:creator": "Bertrand DelacrΘtaz",
+ "dc:format": "application/pdf; version\u003d1.3",
+ "dc:title": "Apache Tika - Apache Tika",
+ "dcterms:created": "2007-09-15T09:02:31Z",
+ "dcterms:modified": "2007-09-15T09:02:31Z",
+ "meta:author": "Bertrand DelacrΘtaz",
+ "meta:creation-date": "2007-09-15T09:02:31Z",
+ "meta:save-date": "2007-09-15T09:02:31Z",
+ "pdf:PDFVersion": "1.3",
+ "pdf:docinfo:created": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:creator": "Bertrand DelacrΘtaz",
+ "pdf:docinfo:creator_tool": "Firefox",
+ "pdf:docinfo:modified": "2007-09-15T09:02:31Z",
+ "pdf:docinfo:producer": "Mac OS X 10.4.10 Quartz PDFContext",
+ "pdf:docinfo:title": "Apache Tika - Apache Tika",
+ "pdf:encrypted": "false",
+ "resourceName": "testPDF.pdf",
+ "xmp:CreatorTool": "Firefox",
+ "xmpTPg:NPages": "1"
+ }
+]
\ No newline at end of file
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html b/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html
new file mode 100644
index 0000000..a08be46
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file15_tags.html
@@ -0,0 +1,31 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<style type="text/css">
+.txt { white-space:nowrap; }
+#f0 { font-family:sans-serif; font-weight:normal; font-style:normal; }
+
+</style>
+</head>
+<body>
+<img id="background" style="position:absolute; left:0px; top:0px;" width="595" height="842" src="page1.png">
+<div class="txt" style="position:absolute; left:18px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika - Apache Tika</span></div>
+<div class="txt" style="position:absolute; left:449px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">http://incubator.apache.org/tika/</span></div>
+<div class="txt" style="position:absolute; left:62px; top:77px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Tika - Content Analysis Toolkit</span></div>
+<div class="txt" style="position:absolute; left:57px; top:118px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is a toolkit for detecting and extracting metadata and structured text content</span></div>
+<div class="txt" style="position:absolute; left:57px; top:131px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">from various documents using existing parser libraries.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:154px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is an effort undergoing </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">incubation </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">at </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">The Apache So [...]
+<div class="txt" style="position:absolute; left:57px; top:167px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">sponsored by the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">Apache Lucene PMC. </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Incubation is required of all newly accepted projects</span></div>
+<div class="txt" style="position:absolute; left:57px; top:180px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">until a further review indicates that the infrastructure, communications, and decision making</span></div>
+<div class="txt" style="position:absolute; left:57px; top:193px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">process have stabilized in a manner consistent with other successful ASF projects. While</span></div>
+<div class="txt" style="position:absolute; left:57px; top:206px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">incubation status is not necessarily a reflection of the completeness or stability of the code, it</span></div>
+<div class="txt" style="position:absolute; left:57px; top:219px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">does indicate that the project has yet to be fully endorsed by the ASF.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:242px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">See the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">Apache Tika Incubation Status </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">page for the current incubation status.</span></div>
+<div class="txt" style="position:absolute; left:62px; top:289px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Latest News</span></div>
+<div class="txt" style="position:absolute; left:62px; top:333px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(152,0,0,1);">March 22nd, 2007: Apache Tika project started</span></div>
+<div class="txt" style="position:absolute; left:92px; top:344px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">The Apache Tika project was formally started when the </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169,1);">Tika proposal </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">was </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169, [...]
+<div class="txt" style="position:absolute; left:92px; top:355px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(60,106,152,1);">Incubator PMC.</span></div>
+<div class="txt" style="position:absolute; left:18px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">1 of 1</span></div>
+<div class="txt" style="position:absolute; left:510px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">15.9.2007 11:02</span></div>
+</body>
+</html>
diff --git a/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html b/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html
new file mode 100644
index 0000000..19ed27c
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsB/file16_badTags.html
@@ -0,0 +1,31 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<style type="text/css">
+.txt { white-space:nowrap; }
+#f0 { font-family:sans-serif; font-weight:normal; font-style:normal; }
+
+</style>
+</head>
+<body>
+<img id="background" style="position:absolute; left:0px; top:0px;" width="595" height="842" src="page1.png">
+<div class="txt" style="position:absolute; left:18px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache <i><b>bad tag</i></b>- Apache Tika</span></div>
+<div class="txt" style="position:absolute; left:449px; top:20px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">http://incubator.apache.org/tika/</span></div>
+<div class="txt" style="position:absolute; left:62px; top:77px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Tika - Content Analysis Toolkit</span></div>
+<div class="txt" style="position:absolute; left:57px; top:118px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is a toolkit for detecting and extracting metadata and structured text content</span></div>
+<div class="txt" style="position:absolute; left:57px; top:131px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">from various documents using existing parser libraries.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:154px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Apache Tika is an effort undergoing </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">incubation </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">at </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">The Apache So [...]
+<div class="txt" style="position:absolute; left:57px; top:167px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">sponsored by the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(50,101,169,1);">Apache Lucene PMC. </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">Incubation is required of all newly accepted projects</span></div>
+<div class="txt" style="position:absolute; left:57px; top:180px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">until a further review indicates that the infrastructure, communications, and decision making</span></div>
+<div class="txt" style="position:absolute; left:57px; top:193px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">process have stabilized in a manner consistent with other successful ASF projects. While</span></div>
+<div class="txt" style="position:absolute; left:57px; top:206px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">incubation status is not necessarily a reflection of the completeness or stability of the code, it</span></div>
+<div class="txt" style="position:absolute; left:57px; top:219px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">does indicate that the project has yet to be fully endorsed by the ASF.</span></div>
+<div class="txt" style="position:absolute; left:57px; top:242px;"><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">See the </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(60,106,152,1);">Apache Tika Incubation Status </span><span id="f27" style="font-size:10px;vertical-align:baseline;color:rgba(0,0,0,1);">page for the current incubation status.</span></div>
+<div class="txt" style="position:absolute; left:62px; top:289px;"><span id="f25" style="font-size:18px;vertical-align:baseline;color:rgba(152,0,0,1);">Latest News</span></div>
+<div class="txt" style="position:absolute; left:62px; top:333px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(152,0,0,1);">March 22nd, 2007: Apache Tika project started</span></div>
+<div class="txt" style="position:absolute; left:92px; top:344px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">The Apache Tika project was formally started when the </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169,1);">Tika proposal </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">was </span><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(50,101,169, [...]
+<div class="txt" style="position:absolute; left:92px; top:355px;"><span id="f27" style="font-size:9px;vertical-align:baseline;color:rgba(60,106,152,1);">Incubator PMC.</span></div>
+<div class="txt" style="position:absolute; left:18px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">1 of 1</span></div>
+<div class="txt" style="position:absolute; left:510px; top:792px;"><span id="f4" style="font-size:9px;vertical-align:baseline;color:rgba(0,0,0,1);">15.9.2007 11:02</span></div>
+</body>
+</html>