You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/07 15:10:11 UTC
[tika] branch 2.x updated: TIKA-2319 follow up
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new fce6626 TIKA-2319 follow up
fce6626 is described below
commit fce6626f2c7fc10840d51ccc9361a86fdd241d46
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Apr 7 11:10:01 2017 -0400
TIKA-2319 follow up
---
.../org/apache/tika/eval/AbstractProfiler.java | 18 +-
.../java/org/apache/tika/eval/ExtractProfiler.java | 8 +-
.../org/apache/tika/eval/XMLErrorLogUpdater.java | 6 +-
.../tika/eval/batch/EvalConsumerBuilder.java | 6 +-
.../main/java/org/apache/tika/eval/db/Cols.java | 8 +-
.../java/org/apache/tika/eval/db/MimeBuffer.java | 2 +-
.../src/main/resources/comparison-reports.xml | 273 ++++++++++-----------
tika-eval/src/main/resources/profile-reports.xml | 10 +-
8 files changed, 165 insertions(+), 166 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 1091537..d0a1a76 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -80,18 +80,18 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
static final long NON_EXISTENT_FILE_LENGTH = -1l;
public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
- new ColInfo(Cols.EXTRACT_EXCEPTION_TYPE_ID, Types.INTEGER),
+ new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
);
public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
- new ColInfo(Cols.PARSE_ERROR_TYPE_ID, Types.INTEGER),
+ new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
);
public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
- new ColInfo(Cols.PARSE_EXCEPTION_TYPE_ID, Types.INTEGER),
+ new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
);
@@ -129,7 +129,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
public static TableInfo MIME_TABLE = new TableInfo("mimes",
- new ColInfo(Cols.MIME_TYPE_ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
);
@@ -217,7 +217,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
Map<Cols, String> data = new HashMap<>();
data.put(Cols.CONTAINER_ID, containerId);
data.put(Cols.FILE_PATH, filePath);
- data.put(Cols.EXTRACT_EXCEPTION_TYPE_ID, Integer.toString(type.ordinal()));
+ data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
writer.writeRow(extractExceptionTable, data);
}
@@ -419,18 +419,18 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
if (matcher.find()) {
- data.put(Cols.PARSE_EXCEPTION_TYPE_ID,
+ data.put(Cols.PARSE_EXCEPTION_ID,
Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
return;
}
matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
if (matcher.find()) {
- data.put(Cols.PARSE_EXCEPTION_TYPE_ID,
+ data.put(Cols.PARSE_EXCEPTION_ID,
Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
return;
}
- data.put(Cols.PARSE_EXCEPTION_TYPE_ID,
+ data.put(Cols.PARSE_EXCEPTION_ID,
Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
data.put(Cols.ORIG_STACK_TRACE, fullTrace);
@@ -562,7 +562,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
return;
}
int mimeId = writer.getMimeId(type);
- output.put(Cols.MIME_TYPE_ID, Integer.toString(mimeId));
+ output.put(Cols.MIME_ID, Integer.toString(mimeId));
}
void writeTokenCounts(Map<Cols, String> data, String field,
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 9b7ddc4..514778f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -91,15 +91,15 @@ public class ExtractProfiler extends AbstractProfiler {
public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions",
new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
- new ColInfo(Cols.EXTRACT_EXCEPTION_TYPE_ID, Types.INTEGER),
- new ColInfo(Cols.PARSE_ERROR_TYPE_ID, Types.INTEGER)
+ new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+ new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)
);
public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions",
new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192),
new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192),
- new ColInfo(Cols.PARSE_EXCEPTION_TYPE_ID, Types.INTEGER)
+ new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)
);
@@ -118,7 +118,7 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.LENGTH, Types.BIGINT),
new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
- new ColInfo(Cols.MIME_TYPE_ID, Types.INTEGER),
+ new ColInfo(Cols.MIME_ID, Types.INTEGER),
new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
index 499b6ac..a744b20 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
@@ -145,7 +145,7 @@ public class XMLErrorLogUpdater {
//if it does, update all records matching that path or container id
if (hitCount > 0) {
sql = "UPDATE " + errorTableName +
- " SET " + Cols.PARSE_ERROR_TYPE_ID +
+ " SET " + Cols.PARSE_ERROR_ID +
" = " + type.ordinal() + ","+
Cols.FILE_PATH + "='" +filePath+"'"+
" where "+Cols.CONTAINER_ID +
@@ -157,13 +157,13 @@ public class XMLErrorLogUpdater {
//insert full record
if (containerId > -1) {
sql = "INSERT INTO " + errorTableName +
- " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_TYPE_ID+")"+
+ " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_ID +")"+
" values (" + containerId + ", '" + filePath + "'," +
type.ordinal() + ");";
} else {
//if container id == -1, insert only file path and parse error type id
sql = "INSERT INTO " + errorTableName +
- " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_TYPE_ID+")"+
+ " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_ID +")"+
"values ('" + filePath + "'," +
type.ordinal() + ");";
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index 6e9b6c9..be0533a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -122,14 +122,14 @@ public abstract class EvalConsumerBuilder {
Map<Cols, String> m = new HashMap<>();
for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
m.clear();
- m.put(Cols.PARSE_ERROR_TYPE_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
}
for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
m.clear();
- m.put(Cols.PARSE_EXCEPTION_TYPE_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
}
@@ -137,7 +137,7 @@ public abstract class EvalConsumerBuilder {
for (ExtractReaderException.TYPE t :
ExtractReaderException.TYPE.values()) {
m.clear();
- m.put(Cols.EXTRACT_EXCEPTION_TYPE_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index 91917ec..e29598d 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -34,7 +34,7 @@ public enum Cols {
NUM_METADATA_VALUES,
IS_EMBEDDED,
EMBEDDED_FILE_PATH,
- MIME_TYPE_ID,
+ MIME_ID,
MD5,
NUM_ATTACHMENTS,
HAS_CONTENT,
@@ -68,19 +68,19 @@ public enum Cols {
DICE_COEFFICIENT,
//errors
- PARSE_ERROR_TYPE_ID,
+ PARSE_ERROR_ID,
PARSE_ERROR_DESCRIPTION,
PARSE_EXCEPTION_DESCRIPTION,
- EXTRACT_EXCEPTION_TYPE_ID,
+ EXTRACT_EXCEPTION_ID,
EXTRACT_EXCEPTION_DESCRIPTION,
//exceptions
ORIG_STACK_TRACE,
SORT_STACK_TRACE,
- PARSE_EXCEPTION_TYPE_ID,
+ PARSE_EXCEPTION_ID,
MIME_STRING,//string representation of mime type
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
index 073dd63..3588622 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -38,7 +38,7 @@ public class MimeBuffer extends AbstractDBBuffer {
public MimeBuffer(Connection connection, TikaConfig config) throws SQLException {
st = connection.prepareStatement("insert into " + AbstractProfiler.MIME_TABLE.getName() + "( " +
- Cols.MIME_TYPE_ID.name() + ", " +
+ Cols.MIME_ID.name() + ", " +
Cols.MIME_STRING.name() + ", " +
Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
this.config = config;
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index e59d474..59d6d5f 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -48,19 +48,19 @@
<!-- build mime indexes -->
<sql>create index if not exists pa_m_idx
- on profiles_a (mime_type_id);
+ on profiles_a (mime_id);
</sql>
<sql>
create index if not exists pb_m_idx
- on profiles_b (mime_type_id);
+ on profiles_b (mime_id);
</sql>
<!-- build exceptions comparison table -->
<sql>drop table if exists exceptions_compared</sql>
<sql>
create table exceptions_compared
- (mime_type_id_a integer, mime_type_id_b integer,
+ (mime_id_a integer, mime_id_b integer,
exceptions_a integer default 0,
total_a integer default 0,
percent_exceptions_a double default 0.0,
@@ -70,50 +70,50 @@
</sql>
<sql>
- insert into exceptions_compared (mime_type_id_a, mime_type_id_b)
- select ma.mime_type_id, mb.mime_type_id
+ insert into exceptions_compared (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
from profiles_a a
join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_type_id=a.mime_type_id
- join mimes mb on mb.mime_type_id=b.mime_type_id
- group by ma.mime_type_id, mb.mime_type_id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
</sql>
<sql>
update exceptions_compared ec set total_a=(
select count(1) as cnt from profiles_a pa
join profiles_b pb on pa.id=pb.id
- where pa.mime_type_id= ec.mime_type_id_a
- and pb.mime_type_id=ec.mime_type_id_b
- group by pa.mime_type_id, pb.mime_type_id);
+ where pa.mime_id= ec.mime_id_a
+ and pb.mime_id=ec.mime_id_b
+ group by pa.mime_id, pb.mime_id);
</sql>
<sql>
update exceptions_compared ec set total_b=(
select count(1) as cnt from profiles_b pb
join profiles_a pa on pa.id=pb.id
- where pa.mime_type_id= ec.mime_type_id_a
- and pb.mime_type_id=ec.mime_type_id_b
- group by pb.mime_type_id, pa.mime_type_id);
+ where pa.mime_id= ec.mime_id_a
+ and pb.mime_id=ec.mime_id_b
+ group by pb.mime_id, pa.mime_id);
</sql>
<sql>
update exceptions_compared ec set exceptions_a=
( select count(1) as cnt from exceptions_a ea
join profiles_a pa on ea.id=pa.id
join profiles_b pb on pa.id=pb.id
- where pa.mime_type_id= ec.mime_type_id_a
- and pb.mime_type_id=ec.mime_type_id_b
- and parse_exception_type_id=0
- group by pa.mime_type_id, pb.mime_type_id);
+ where pa.mime_id= ec.mime_id_a
+ and pb.mime_id=ec.mime_id_b
+ and parse_exception_id=0
+ group by pa.mime_id, pb.mime_id);
</sql>
<sql>
update exceptions_compared ec set exceptions_b=
( select count(1) as cnt from exceptions_b eb
join profiles_b pb on eb.id=pa.id
join profiles_a pa on pa.id=pb.id
- where pa.mime_type_id= ec.mime_type_id_a
- and pb.mime_type_id=ec.mime_type_id_b
- and parse_exception_type_id=0
- group by pb.mime_type_id, pa.mime_type_id);
+ where pa.mime_id= ec.mime_id_a
+ and pb.mime_id=ec.mime_id_b
+ and parse_exception_id=0
+ group by pb.mime_id, pa.mime_id);
</sql>
<sql>
@@ -130,12 +130,11 @@
</sql>
<!-- build tmp common words table -->
- <!-- build exceptions comparison table -->
<sql>drop table if exists token_counts_compared</sql>
<sql>
create table token_counts_compared
- (mime_type_id_a integer,
- mime_type_id_b integer,
+ (mime_id_a integer,
+ mime_id_b integer,
num_tokens_a integer default 0,
num_tokens_b integer default 0,
num_alphabetic_tokens_a integer default 0,
@@ -145,13 +144,13 @@
);
</sql>
<sql>
- insert into token_counts_compared (mime_type_id_a, mime_type_id_b)
- select ma.mime_type_id, mb.mime_type_id
+ insert into token_counts_compared (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
from profiles_a a
join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_type_id=a.mime_type_id
- join mimes mb on mb.mime_type_id=b.mime_type_id
- group by ma.mime_type_id, mb.mime_type_id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
</sql>
@@ -160,9 +159,9 @@
select sum(num_tokens) as cnt from profiles_a pa
join profiles_b pb on pa.id=pb.id
join contents_a c on c.id = pa.id
- where pb.mime_type_id= tcc.mime_type_id_b
- and pa.mime_type_id=tcc.mime_type_id_a
- group by mime_type_id_a, mime_type_id_b
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
);
</sql>
@@ -171,9 +170,9 @@
select sum(num_tokens) as cnt from profiles_b pb
join profiles_a pa on pa.id=pb.id
join contents_b c on c.id = pb.id
- where pb.mime_type_id= tcc.mime_type_id_b
- and pa.mime_type_id=tcc.mime_type_id_a
- group by mime_type_id_a, mime_type_id_b
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
);
</sql>
@@ -182,9 +181,9 @@
select sum(num_alphabetic_tokens) as cnt from profiles_a pa
join profiles_b pb on pa.id=pb.id
join contents_a c on c.id = pa.id
- where pb.mime_type_id= tcc.mime_type_id_b
- and pa.mime_type_id=tcc.mime_type_id_a
- group by mime_type_id_a, mime_type_id_b
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
);
</sql>
@@ -193,9 +192,9 @@
select sum(num_alphabetic_tokens) as cnt from profiles_b pb
join profiles_a pa on pb.id=pa.id
join contents_b c on c.id = pb.id
- where pb.mime_type_id= tcc.mime_type_id_b
- and pa.mime_type_id=tcc.mime_type_id_a
- group by mime_type_id_a, mime_type_id_b
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
);
</sql>
@@ -204,9 +203,9 @@
select sum(num_common_tokens) as cnt from profiles_a pa
join profiles_b pb on pa.id=pb.id
join contents_a c on c.id = pa.id
- where pb.mime_type_id= tcc.mime_type_id_b
- and pa.mime_type_id=tcc.mime_type_id_a
- group by mime_type_id_a, mime_type_id_b
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
);
</sql>
@@ -215,9 +214,9 @@
select sum(num_common_tokens) as cnt from profiles_b pb
join profiles_a pa on pa.id=pb.id
join contents_b c on c.id = pb.id
- where pb.mime_type_id= tcc.mime_type_id_b
- and pa.mime_type_id=tcc.mime_type_id_a
- group by mime_type_id_a, mime_type_id_b
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
);
</sql>
@@ -232,7 +231,7 @@
<sql>
select mime_string, count(1) cnt from
profiles_a p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
group by mime_string
order by cnt desc
</sql>
@@ -246,7 +245,7 @@
<sql>
select mime_string, count(1) cnt from
profiles_b p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
group by mime_string
order by cnt desc
</sql>
@@ -259,7 +258,7 @@
<sql>
select mime_string, count(1) cnt from
profiles_a p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=false
group by mime_string
order by cnt desc
@@ -274,7 +273,7 @@
<sql>
select mime_string, count(1) cnt from
profiles_b p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=false
group by mime_string
order by cnt desc
@@ -288,7 +287,7 @@
<sql>
select mime_string, count(1) cnt from
profiles_a p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=true
group by mime_string
order by cnt desc
@@ -303,7 +302,7 @@
<sql>
select mime_string, count(1) cnt from
profiles_b p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=true
group by mime_string
order by cnt desc
@@ -319,9 +318,9 @@
MIME_A_TO_MIME_B, count(1) as COUNT
from profiles_a a
join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_type_id=a.mime_type_id
- join mimes mb on mb.mime_type_id=b.mime_type_id
- where a.mime_type_id <> b.mime_type_id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ where a.mime_id <> b.mime_id
group by MIME_A_TO_MIME_B
order by COUNT DESC
</sql>
@@ -337,10 +336,10 @@
MIME_A_TO_MIME_B, file_path, a.file_name
from profiles_a a
join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_type_id=a.mime_type_id
- join mimes mb on mb.mime_type_id=b.mime_type_id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
join containers c on a.container_id=c.container_id
- where a.mime_type_id <> b.mime_type_id
+ where a.mime_id <> b.mime_id
order by MIME_A_TO_MIME_B
</sql>
</report>
@@ -356,7 +355,7 @@
select mime_string, count(1) cnt from
exceptions_a e
join profiles_a p on p.id=e.id
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
group by mime_string
order by cnt desc
</sql>
@@ -370,7 +369,7 @@
select mime_string, count(1) cnt from
exceptions_b e
join profiles_b p on p.id=e.id
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
group by mime_string
order by cnt desc
</sql>
@@ -385,9 +384,9 @@
select mime_string, count(1) cnt from
exceptions_a e
join profiles_a p on p.id=e.id
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=false
- and parse_exception_type_id=0
+ and parse_exception_id=0
group by mime_string
order by cnt desc
</sql>
@@ -402,15 +401,15 @@
select mime_string, count(1) cnt from
exceptions_b e
join profiles_b p on p.id=e.id
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=false
- and parse_exception_type_id=0
+ and parse_exception_id=0
group by mime_string
order by cnt desc
</sql>
</report>
<report reportName="AllExceptionsByMimeByTypeA"
- reportFilename="exceptions/exceptions_by_mime_by_typeA.xlsx"
+ reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
format="xlsx"
includeSql="true">
@@ -420,16 +419,16 @@
from exceptions_a e
join profiles_a p on p.id=e.id
join containers c on p.container_id=c.container_id
- join mimes m on m.mime_type_id=p.mime_type_id
+ join mimes m on m.mime_id=p.mime_id
join ref_parse_exception_types r on
- r.parse_exception_type_id=e.parse_exception_type_id
- group by p.mime_type_id, parse_exception_description
+ r.parse_exception_id=e.parse_exception_id
+ group by p.mime_id, parse_exception_description
order by MIME_TYPE, EXCEPTION_TYPE
</sql>
</report>
<report reportName="AllExceptionsByMimeByTypeB"
- reportFilename="exceptions/exceptions_by_mime_by_typeB.xlsx"
+ reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
format="xlsx"
includeSql="true">
@@ -439,16 +438,16 @@
from exceptions_b e
join profiles_b p on p.id=e.id
join containers c on p.container_id=c.container_id
- join mimes m on m.mime_type_id=p.mime_type_id
+ join mimes m on m.mime_id=p.mime_id
join ref_parse_exception_types r on
- r.parse_exception_type_id=e.parse_exception_type_id
- group by p.mime_type_id, parse_exception_description
+ r.parse_exception_id=e.parse_exception_id
+ group by p.mime_id, parse_exception_description
order by MIME_TYPE, EXCEPTION_TYPE
</sql>
</report>
<report reportName="TextLostFromACausedByNewExceptionsInB"
- reportFilename="exceptions/textLostFromACausedByNewExceptionsInB.xlsx"
+ reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
format="xlsx"
includeSql="true">
@@ -476,7 +475,7 @@
</report>
<report reportName="FixedExceptionsInBByMimeType"
- reportFilename="exceptions/fixedExceptionsInBByMimeType.xlsx"
+ reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
format="xlsx"
includeSql="true">
@@ -487,15 +486,15 @@
join profiles_a pa on pa.id=ea.id
join profiles_b pb on pa.id=pb.id
join containers c on pa.container_id=c.container_id
- join mimes m on m.mime_type_id=pa.mime_type_id
+ join mimes m on m.mime_id=pa.mime_id
where eb.id is null
- and ea.parse_exception_type_id=0
+ and ea.parse_exception_id=0
group by mime_string
</sql>
</report>
<report reportName="FixedExceptionsInByDetails"
- reportFilename="exceptions/fixedExceptionsInBDetails.xlsx"
+ reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
format="xlsx"
includeSql="true">
<sql>
@@ -506,14 +505,14 @@
join profiles_a pa on pa.id=ea.id
join profiles_b pb on pb.id=pa.id //this ensures that files were actually processed in both runs
join containers c on pa.container_id=c.container_id
- join mimes m on m.mime_type_id=pa.mime_type_id
+ join mimes m on m.mime_id=pa.mime_id
where eb.id is null
- and ea.parse_exception_type_id=0
+ and ea.parse_exception_id=0
order by mime_string
</sql>
</report>
<report reportName="ContentsOfFixedExceptionsInB"
- reportFilename="exceptions/contentsOfFixedExceptionsInB.xlsx"
+ reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
format="xlsx"
includeSql="true">
@@ -527,14 +526,14 @@
join profiles_a p on p.id=ea.id
join contents_b cb on cb.id=ea.id
join containers c on p.container_id=c.container_id
- join mimes m on m.mime_type_id=p.mime_type_id
+ join mimes m on m.mime_id=p.mime_id
where eb.id is null
- and ea.parse_exception_type_id=0
+ and ea.parse_exception_id=0
</sql>
</report>
<report reportName="NewExceptionsByMimeType"
- reportFilename="exceptions/newExceptionsInBByMimeType.xlsx"
+ reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
format="xlsx"
includeSql="true">
@@ -545,16 +544,16 @@
join profiles_a pa on pa.id=eb.id
join profiles_b pb on pb.id=pa.id
join containers c on pa.container_id=c.container_id
- join mimes m on m.mime_type_id=pa.mime_type_id
+ join mimes m on m.mime_id=pa.mime_id
where ea.id is null
- and eb.parse_exception_type_id=0
+ and eb.parse_exception_id=0
group by mime_string
order by COUNT desc
</sql>
</report>
<report reportName="NewExceptionsInBByMimeTypeByStackTrace"
- reportFilename="exceptions/newExceptionsInBByMimeTypeByStackTrace.xlsx"
+ reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
format="xlsx"
includeSql="true">
@@ -564,16 +563,16 @@
from exceptions_b eb
left join exceptions_a ea on ea.id = eb.id
join profiles_a p on p.id=eb.id
- join mimes m on m.mime_type_id=p.mime_type_id
+ join mimes m on m.mime_id=p.mime_id
where ea.id is null
- and eb.parse_exception_type_id=0
+ and eb.parse_exception_id=0
group by MIME_TYPE, eb.sort_stack_trace
order by MIME_TYPE asc, COUNT desc
</sql>
</report>
<report reportName="NewExceptionsInBDetails"
- reportFilename="exceptions/newExceptionsInBDetails.xlsx"
+ reportFilename="exceptions/new_exceptions_in_B_details.xlsx"
format="xlsx"
includeSql="true">
@@ -584,15 +583,15 @@
left join exceptions_a ea on ea.id = eb.id
join profiles_a p on p.id=eb.id
join containers c on p.container_id=c.container_id
- join mimes m on m.mime_type_id=p.mime_type_id
+ join mimes m on m.mime_id=p.mime_id
where ea.id is null
- and eb.parse_exception_type_id=0
+ and eb.parse_exception_id=0
order by MIME_TYPE asc, eb.ORIG_STACK_TRACE
</sql>
</report>
<report reportName="StackTracesByMimeInA"
- reportFilename="exceptions/stackTracesByMimeInA.xlsx"
+ reportFilename="exceptions/stack_traces_by_mime_A.xlsx"
format="xlsx"
includeSql="true">
@@ -601,15 +600,15 @@
COUNT
from exceptions_a e
join profiles_a p on p.id=e.id
- join mimes m on m.mime_type_id=p.mime_type_id
- and e.parse_exception_type_id=0
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
group by MIME_TYPE, e.sort_stack_trace
order by MIME_TYPE asc, COUNT desc
</sql>
</report>
<report reportName="AllStackTracesInA"
- reportFilename="exceptions/stackTracesInA.xlsx"
+ reportFilename="exceptions/stack_traces_A.xlsx"
format="xlsx"
includeSql="true">
@@ -619,14 +618,14 @@
from exceptions_a e
join profiles_a p on p.id=e.id
join containers c on p.container_id=c.container_id
- join mimes m on m.mime_type_id=p.mime_type_id
- and e.parse_exception_type_id=0
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
FILE_LENGTH asc
</sql>
</report>
<report reportName="AllStackTracesInB"
- reportFilename="exceptions/stackTracesInB.xlsx"
+ reportFilename="exceptions/stack_traces_B.xlsx"
format="xlsx"
includeSql="true">
@@ -636,15 +635,15 @@
from exceptions_b e
join profiles_b p on p.id=e.id
join containers c on p.container_id=c.container_id
- join mimes m on m.mime_type_id=p.mime_type_id
- and e.parse_exception_type_id=0
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
FILE_LENGTH asc
</sql>
</report>
<report reportName="StackTracesByMimeInB"
- reportFilename="exceptions/stackTracesByMimeInB.xlsx"
+ reportFilename="exceptions/stack_traces_by_mime_B.xlsx"
format="xlsx"
includeSql="true">
@@ -653,8 +652,8 @@
COUNT
from exceptions_b e
join profiles_b p on p.id=e.id
- join mimes m on m.mime_type_id=p.mime_type_id
- and e.parse_exception_type_id=0
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
group by MIME_TYPE, e.sort_stack_trace
order by MIME_TYPE asc, COUNT desc
</sql>
@@ -667,7 +666,7 @@
select file_path, extract_exception_description
from extract_exceptions_a e
join ref_extract_exception_types t
- on e.extract_exception_type_id=t.extract_exception_type_id
+ on e.extract_exception_id=t.extract_exception_id
</sql>
</report>
<report reportName="extractExceptionsB"
@@ -678,7 +677,7 @@
select file_path, extract_exception_description
from extract_exceptions_b e
join ref_extract_exception_types t
- on e.extract_exception_type_id=t.extract_exception_type_id
+ on e.extract_exception_id=t.extract_exception_id
</sql>
</report>
<report reportName="parseExceptionTypesA"
@@ -689,8 +688,8 @@
select parse_exception_description, count(1)
from exceptions_a e
join ref_parse_exception_types t on
- t.parse_exception_type_id=e.parse_exception_type_id
- group by e.parse_exception_type_id
+ t.parse_exception_id=e.parse_exception_id
+ group by e.parse_exception_id
</sql>
</report>
<report reportName="parseExceptionTypesB"
@@ -701,8 +700,8 @@
select parse_exception_description, count(1)
from exceptions_b e
join ref_parse_exception_types t on
- t.parse_exception_type_id=e.parse_exception_type_id
- group by e.parse_exception_type_id
+ t.parse_exception_id=e.parse_exception_id
+ group by e.parse_exception_id
</sql>
</report>
@@ -738,15 +737,15 @@
join profiles_a pa on pa.id = cc.id
join profiles_b pb on pb.id=cc.id
join containers c on c.container_id=pa.container_id
- join mimes ma on ma.mime_type_id=pa.mime_type_id
- join mimes mb on mb.mime_type_id=pb.mime_type_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
left join exceptions_a ea on ea.id=cc.id
left join exceptions_b eb on eb.id=cc.id
where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30)
- and (ea.parse_exception_type_id is null or
- ea.parse_exception_type_id <> 2)
- and (eb.parse_exception_type_id is null or
- eb.parse_exception_type_id <> 2)
+ and (ea.parse_exception_id is null or
+ ea.parse_exception_id <> 2)
+ and (eb.parse_exception_id is null or
+ eb.parse_exception_id <> 2)
order by ma.mime_string, overlap asc
limit 100000
</sql>
@@ -783,13 +782,13 @@
join profiles_a pa on pa.id = cc.id
join profiles_b pb on pb.id=cc.id
join containers c on c.container_id=pa.container_id
- join mimes ma on ma.mime_type_id=pa.mime_type_id
- join mimes mb on mb.mime_type_id=pb.mime_type_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
left join exceptions_a ea on ea.id=cc.id
left join exceptions_b eb on eb.id=cc.id
where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30)
- and (ea.parse_exception_type_id is null)
- and (eb.parse_exception_type_id is null)
+ and (ea.parse_exception_id is null)
+ and (eb.parse_exception_id is null)
order by ma.mime_string, overlap asc
limit 100000
</sql>
@@ -807,8 +806,8 @@
num_common_tokens_a, num_common_tokens_b,
ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as change_in_common_tokens_b
from token_counts_compared tcc
- join mimes ma on tcc.mime_type_id_a = ma.mime_type_id
- join mimes mb on tcc.mime_type_id_b = mb.mime_type_id
+ join mimes ma on tcc.mime_id_a = ma.mime_id
+ join mimes mb on tcc.mime_id_b = mb.mime_id
order by change_in_common_tokens_b desc
</sql>
</report>
@@ -824,8 +823,8 @@
total_a, percent_exceptions_a,
exceptions_b, total_b, percent_exceptions_b
from exceptions_compared c
- join mimes ma on ma.mime_type_id=c.mime_type_id_a
- join mimes mb on mb.mime_type_id=c.mime_type_id_b
+ join mimes ma on ma.mime_id=c.mime_id_a
+ join mimes mb on mb.mime_id=c.mime_id_b
order by percent_exceptions_b desc, total_b desc;
</sql>
</report>
@@ -898,18 +897,18 @@
mb.mime_string as mime_string_b,
pa.num_attachments as num_attachments_a,
pb.num_attachments as num_attachments_b,
- ea.parse_exception_type_id as exception_type_id_a,
- eb.parse_exception_type_id as exception_type_id_b
+ ea.parse_exception_id as exception_id_a,
+ eb.parse_exception_id as exception_id_b
from profiles_a pa
join profiles_b pb on pa.id= pb.id
join containers c on pa.container_id=c.container_id
- join mimes ma on pa.mime_type_id=ma.mime_type_id
- join mimes mb on pb.mime_type_id=mb.mime_type_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
left join exceptions_a ea on ea.id=pa.id
left join exceptions_b eb on eb.id=pb.id
where pa.is_embedded=false and
- ea.parse_exception_type_id is null and
- eb.parse_exception_type_id is null
+ ea.parse_exception_id is null and
+ eb.parse_exception_id is null
and pa.num_attachments <> pb.num_attachments
order by ma.mime_string, pb.num_attachments-pa.num_attachments
limit 1000;
@@ -928,18 +927,18 @@
mb.mime_string as mime_string_b,
pa.num_metadata_values as num_metadata_values_a,
pb.num_metadata_values as num_metadata_values_b,
- ea.parse_exception_type_id as parse_ex_type_id_a,
- eb.parse_exception_type_id as parse_ex_type_id_b
+ ea.parse_exception_id as parse_ex_id_a,
+ eb.parse_exception_id as parse_ex_id_b
from profiles_a pa
join profiles_b pb on pa.id= pb.id
join containers c on pa.container_id=c.container_id
- join mimes ma on pa.mime_type_id=ma.mime_type_id
- join mimes mb on pb.mime_type_id=mb.mime_type_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
left join exceptions_a ea on ea.id=pa.id
left join exceptions_b eb on eb.id=pb.id
where
- ea.parse_exception_type_id is null and
- eb.parse_exception_type_id is null
+ ea.parse_exception_id is null and
+ eb.parse_exception_id is null
and pa.num_metadata_values <> pb.num_metadata_values
order by ma.mime_string,
pb.num_metadata_values-pa.num_metadata_values
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 1f9be6a..87642fd 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -35,7 +35,7 @@
<sql>
select mime_string, count(1) cnt from
profiles p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
group by mime_string
order by cnt desc
</sql>
@@ -48,7 +48,7 @@
<sql>
select mime_string, count(1) cnt from
profiles p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=false
group by mime_string
order by cnt desc
@@ -63,7 +63,7 @@
<sql>
select mime_string, count(1) cnt from
profiles p
- join mimes m on m.mime_type_id = p.mime_type_id
+ join mimes m on m.mime_id = p.mime_id
where is_embedded=true
group by mime_string
order by cnt desc
@@ -119,7 +119,7 @@
select parse_exception_description, count(1) cnt
from parse_exceptions e
join profiles p on p.id = e.id
- join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id
+ join ref_parse_exception_types et on et.parse_exception_id=e.parse_exception_id
group by parse_exception_description
order by cnt desc;
</sql>
@@ -135,7 +135,7 @@
select parse_exception_description, count(1) cnt
from parse_exceptions e
join profiles p on p.id = e.id
- join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id
+ join ref_parse_exception_types et on et.parse_exception_id=e.parse_exception_id
where is_embedded=true
group by parse_exception_description
order by cnt desc;
--
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.