You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/30 18:05:32 UTC
[tika] branch master updated: TIKA-2822 -- remove common >=4 letter
html markup entities
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 8c22f05 TIKA-2822 -- remove common >=4 letter html markup entities
8c22f05 is described below
commit 8c22f054ea94526e6d22a3f4c923e0b8724f2831
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jan 30 13:05:09 2019 -0500
TIKA-2822 -- remove common >=4 letter html markup entities
---
.../tika/eval/tools/TopCommonTokenCounter.java | 30 +++++++++++++++++++++-
tika-eval/src/main/resources/common_tokens/ar | 4 +--
tika-eval/src/main/resources/common_tokens/bn | 18 ++++++-------
tika-eval/src/main/resources/common_tokens/de | 18 ++++++-------
tika-eval/src/main/resources/common_tokens/el | 18 ++++++-------
tika-eval/src/main/resources/common_tokens/en | 26 +++++++++----------
tika-eval/src/main/resources/common_tokens/es | 14 +++++-----
tika-eval/src/main/resources/common_tokens/fa | 16 ++++++------
tika-eval/src/main/resources/common_tokens/fr | 22 ++++++++--------
tika-eval/src/main/resources/common_tokens/he | 6 ++---
tika-eval/src/main/resources/common_tokens/hi | 24 ++++++++---------
tika-eval/src/main/resources/common_tokens/id | 22 ++++++++--------
tika-eval/src/main/resources/common_tokens/it | 20 +++++++--------
tika-eval/src/main/resources/common_tokens/ja | 6 ++---
tika-eval/src/main/resources/common_tokens/ko | 4 +--
tika-eval/src/main/resources/common_tokens/nl | 16 ++++++------
tika-eval/src/main/resources/common_tokens/pt | 18 ++++++-------
tika-eval/src/main/resources/common_tokens/ru | 8 +++---
tika-eval/src/main/resources/common_tokens/ur | 18 ++++++-------
tika-eval/src/main/resources/common_tokens/vi | 28 ++++++++++----------
tika-eval/src/main/resources/common_tokens/zh-cn | 2 +-
tika-eval/src/main/resources/common_tokens/zh-tw | 2 +-
22 files changed, 184 insertions(+), 156 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
index 5fdd7e8..4d76b7c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
@@ -56,6 +56,9 @@ import org.apache.tika.eval.tokens.AnalyzerManager;
*
* The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
* but includes bigrams for cjk.
+ *
+ * It also has a white list for __email__ and __url__ and a black list
+ * for common html markup terms.
*/
public class TopCommonTokenCounter {
private static final String FIELD = "f";
@@ -69,6 +72,31 @@ public class TopCommonTokenCounter {
}
));
+ //words to ignore
+ //these are common html markup words (4 or more letters) that we do
+ //not want to count in case of failed markup processing.
+ //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
+ static Set<String> BLACK_LIST = new HashSet<>(Arrays.asList(
+ "span",
+ "table",
+ "href",
+ "head",
+ "title",
+ "body",
+ "html",
+ "tagname",
+ "lang",
+ "style",
+ "script",
+ "strong",
+ "blockquote",
+ "form",
+ "iframe",
+ "section",
+ "colspan",
+ "rowspan"
+ ));
+
public static void main(String[] args) throws Exception {
Path inputFile = Paths.get(args[0]);
Path commonTokensFile = Paths.get(args[1]);
@@ -127,7 +155,7 @@ public class TopCommonTokenCounter {
if (queue.top() == null || queue.size() < TOP_N ||
df >= queue.top().df) {
String t = bytesRef.utf8ToString();
- if (! WHITE_LIST.contains(t)) {
+ if (! WHITE_LIST.contains(t) && ! BLACK_LIST.contains(t)) {
queue.insertWithOverflow(new TokenDFTF(t, df, tf));
}
diff --git a/tika-eval/src/main/resources/common_tokens/ar b/tika-eval/src/main/resources/common_tokens/ar
index c99bf24..5a9a6c4 100644
--- a/tika-eval/src/main/resources/common_tokens/ar
+++ b/tika-eval/src/main/resources/common_tokens/ar
@@ -6975,7 +6975,6 @@ center
سمات
لوحظ
مسافات
-html
الحافظ
اواسط
اياه
@@ -9605,7 +9604,6 @@ school
عينيه
مرجع
ولاد
-style
الابنه
الامني
الانخفاض
@@ -20000,3 +19998,5 @@ california
شنقا
طرفه
فانتقل
+قابوس
+لاسبانيا
diff --git a/tika-eval/src/main/resources/common_tokens/bn b/tika-eval/src/main/resources/common_tokens/bn
index 64558be..950d7e5 100644
--- a/tika-eval/src/main/resources/common_tokens/bn
+++ b/tika-eval/src/main/resources/common_tokens/bn
@@ -3635,7 +3635,6 @@ film
রাননা
রোডে
সংসদে
-html
অনুরোধে
ডানার
পাযনি
@@ -9035,7 +9034,6 @@ editions
illustrated
kingdom
period
-section
অবমুকত
অভযনতরের
আওতাভুকত
@@ -10146,7 +10144,6 @@ germany
awards
editor
sanskrit
-title
অনযভাবে
অবহেলিত
অসফল
@@ -10282,7 +10279,6 @@ where
সালতানাত
সুষম
হতাশা
-body
standard
অনিবারয
অবদি
@@ -11109,7 +11105,6 @@ technical
cinema
class
eastern
-form
sites
translations
অধযযনকালীন
@@ -11208,7 +11203,6 @@ independent
past
services
song
-style
well
whois
wiki
@@ -11926,7 +11920,6 @@ archived
known
lives
roerich
-table
অঞজন
অবযহতি
অবিশবাসয
@@ -14533,7 +14526,6 @@ disease
educational
function
geographic
-head
johnson
leiden
master
@@ -17718,7 +17710,6 @@ hand
highest
http
java
-lang
legal
market
metropolitan
@@ -20000,3 +19991,12 @@ wars
কযামবরিজের
করমদকষতা
করমীকে
+কলাকৌশল
+কলাসিকাল
+কলোরিন
+কারযপদধতি
+কুচবিহার
+কুলসুম
+কুশিযারা
+কৃষণনগরের
+কেরিযার
diff --git a/tika-eval/src/main/resources/common_tokens/de b/tika-eval/src/main/resources/common_tokens/de
index a163e83..517e5cc 100644
--- a/tika-eval/src/main/resources/common_tokens/de
+++ b/tika-eval/src/main/resources/common_tokens/de
@@ -228,7 +228,6 @@ soll
land
sollte
darauf
-lang
ubernahm
lage
weltkrieg
@@ -239,7 +238,6 @@ innerhalb
davon
fuhrt
frankreich
-form
quellen
frau
gruppe
@@ -3124,7 +3122,6 @@ konzert
trugen
last
norbert
-html
kunden
sekunden
stiess
@@ -6701,7 +6698,6 @@ camp
komplexen
bilanz
lokomotiven
-head
bindung
pays
tisch
@@ -8235,6 +8231,7 @@ landsmann
moral
abfahrt
geraden
+__toc__
poesie
beliebten
adelsfamilie
@@ -9026,7 +9023,6 @@ absoluten
magic
eisenbahnen
ereignete
-body
giorgio
grabmal
verwandelt
@@ -9988,6 +9984,7 @@ telefon
fehlten
hallen
verkehrten
+__noeditsection__
empfanger
mosel
geschutztes
@@ -10326,7 +10323,6 @@ railroad
universidad
else
sozialdemokraten
-style
utah
bauingenieur
gebrauchliche
@@ -12813,7 +12809,6 @@ geometrischen
reuter
libyen
reportage
-section
archivs
autounfall
energien
@@ -12921,7 +12916,6 @@ ersichtlich
iwanowitsch
klee
local
-span
ruder
aufstiegsrunde
stadtbefestigung
@@ -16585,6 +16579,7 @@ verwirrung
gernot
handelns
nutzlich
+oibook_result
sudteil
wasserstand
visuellen
@@ -17220,7 +17215,6 @@ dramatisch
irgendwo
popularsten
schulung
-table
zubereitung
belassen
eichenlaub
@@ -20000,3 +19994,9 @@ gepfarrt
individuums
phrase
priestern
+vorstellte
+witte
+wolga
+lampe
+matrosen
+niedersachsens
diff --git a/tika-eval/src/main/resources/common_tokens/el b/tika-eval/src/main/resources/common_tokens/el
index 4124796..05a8af1 100644
--- a/tika-eval/src/main/resources/common_tokens/el
+++ b/tika-eval/src/main/resources/common_tokens/el
@@ -4335,7 +4335,6 @@ smith
απαγορευση
πηγων
χριστιανων
-html
αρχηγου
βραζιλιασ
δημιουργουνται
@@ -6710,7 +6709,6 @@ frederick
μιση
πεποιθησεισ
ρωμαικου
-style
εξωτερικησ
θυσια
μακεδονικου
@@ -7907,7 +7905,6 @@ political
χαριν
council
jacques
-lang
αθηναιουσ
ασκειται
διεξαγει
@@ -10153,7 +10150,6 @@ department
φιλοδοξιεσ
χαοσ
domain
-head
literature
αναλογωσ
θερετρο
@@ -11114,7 +11110,6 @@ republic
σωζομενα
τσεχοσλοβακιασ
φυλασσονται
-colspan
never
penguin
γεωμετρικη
@@ -13799,7 +13794,6 @@ army
birds
ferdinand
mario
-table
αλγεβρα
ανεβαινουν
απογοητευμενοσ
@@ -13959,7 +13953,6 @@ ultimate
συμφωνιων
υπογραφτηκε
χωρισμο
-body
heritage
springer
ακτινασ
@@ -16350,7 +16343,6 @@ bulletin
estadio
etudes
look
-title
village
αθλητικουσ
αλικησ
@@ -18645,7 +18637,6 @@ demo
gregory
hamlyn
klaus
-section
temple
wisconsin
αβεβαιοτητα
@@ -20000,3 +19991,12 @@ snow
κοκκινησ
λορεντσο
μαξιμιλιανου
+μαχαιρια
+μεταβληθηκε
+μεταγραφεσ
+μεταλλουργια
+μπαλκονι
+μπερκ
+ναυπηγηση
+νευτωνα
+οικοδομικη
diff --git a/tika-eval/src/main/resources/common_tokens/en b/tika-eval/src/main/resources/common_tokens/en
index 04ce8d2..f1cb892 100644
--- a/tika-eval/src/main/resources/common_tokens/en
+++ b/tika-eval/src/main/resources/common_tokens/en
@@ -192,7 +192,6 @@ system
become
different
given
-form
went
established
came
@@ -220,7 +219,6 @@ play
currently
period
originally
-head
president
young
track
@@ -233,7 +231,6 @@ works
once
joined
using
-title
river
making
community
@@ -388,7 +385,6 @@ release
radio
research
previously
-style
night
help
remained
@@ -436,7 +432,6 @@ round
charles
designed
court
-body
province
largest
michael
@@ -552,7 +547,6 @@ spent
peter
finally
introduced
-strong
running
nine
complete
@@ -593,7 +587,6 @@ native
blue
recent
administrative
-section
biography
culture
plays
@@ -1211,7 +1204,6 @@ democratic
reason
francisco
kind
-table
growth
hour
territory
@@ -3191,7 +3183,6 @@ quick
sing
influences
louisiana
-script
fighter
hungarian
recipient
@@ -4291,7 +4282,6 @@ palm
associates
finnish
diplomatic
-span
innovation
random
vegas
@@ -7234,7 +7224,6 @@ stealing
uganda
satisfied
confrontation
-lang
trailer
flooded
loans
@@ -11523,7 +11512,6 @@ illuminated
rockefeller
surgeons
hydroelectric
-colspan
rusty
wolverhampton
balkans
@@ -13334,7 +13322,6 @@ cradle
engraver
espana
fanny
-html
magnum
cathy
classify
@@ -13796,6 +13783,7 @@ ardent
beautifully
befriends
nana
+__toc__
atoll
ballistic
collier
@@ -20000,3 +19988,15 @@ sore
whitfield
adventist
breaches
+councilman
+cristo
+extremist
+fitch
+fondness
+madeline
+neuronal
+scramble
+sigismund
+undermining
+wladyslaw
+bette
diff --git a/tika-eval/src/main/resources/common_tokens/es b/tika-eval/src/main/resources/common_tokens/es
index af172da..f707d94 100644
--- a/tika-eval/src/main/resources/common_tokens/es
+++ b/tika-eval/src/main/resources/common_tokens/es
@@ -7106,7 +7106,6 @@ sensible
haciendose
cubren
asistencias
-html
plus
afirmar
disenar
@@ -8504,7 +8503,6 @@ rechazar
arkansas
women
demarcacion
-head
sujeta
chiapas
fanaticos
@@ -8981,7 +8979,6 @@ march
exigencias
lozano
colega
-style
hibrido
muller
lanzando
@@ -10273,7 +10270,6 @@ confirmacion
friends
peregrinos
prometido
-lang
lomas
funk
invadir
@@ -12702,7 +12698,6 @@ inmigrante
mango
orquidea
alternas
-body
mohamed
mudejar
analysis
@@ -16715,7 +16710,6 @@ pumas
quieres
serbios
solteros
-table
adscrita
cronologicamente
entregan
@@ -17465,7 +17459,6 @@ doblado
mendizabal
resiste
simetrica
-strong
superviso
timido
tributarios
@@ -19518,6 +19511,7 @@ dinamicos
entendia
finn
maiden
+oibook_result
regido
sanguinea
saturnino
@@ -20000,3 +19994,9 @@ programar
reducciones
sucesivo
suroriental
+vocablos
+badalona
+cilindricas
+coordinada
+cunada
+elio
diff --git a/tika-eval/src/main/resources/common_tokens/fa b/tika-eval/src/main/resources/common_tokens/fa
index 6cedfdc..f63d3bc 100644
--- a/tika-eval/src/main/resources/common_tokens/fa
+++ b/tika-eval/src/main/resources/common_tokens/fa
@@ -4949,7 +4949,6 @@ gmina
الهيات
كارامد
مورگان
-html
خودروسازي
ماهواره
مغولستان
@@ -11097,7 +11096,6 @@ report
هرمي
همانا
پايينتري
-lang
ابرانساني
الزمان
بردارند
@@ -13986,7 +13984,6 @@ pictures
carolina
death
golden
-section
اتفاقا
اروپاست
ازدحام
@@ -14950,7 +14947,7 @@ while
گراديان
گنبدهاي
گولد
-head
+__toc__
heart
party
principles
@@ -15993,11 +15990,8 @@ organization
گدانسك
گيريهاي
advanced
-body
england
-form
nova
-table
احترامي
ارايهاي
اشتها
@@ -18152,7 +18146,6 @@ catalog
florida
role
selected
-style
tehran
than
ابوتراب
@@ -20000,3 +19993,10 @@ provincia
نازكتر
ناكارامدي
نانا
+نهادهايي
+نيمرخ
+هابيت
+هاليفاكس
+هدفشان
+همجوشي
+ولپي
diff --git a/tika-eval/src/main/resources/common_tokens/fr b/tika-eval/src/main/resources/common_tokens/fr
index 43a8f7f..216eaca 100644
--- a/tika-eval/src/main/resources/common_tokens/fr
+++ b/tika-eval/src/main/resources/common_tokens/fr
@@ -461,7 +461,6 @@ superieur
gauche
voit
suisse
-style
epoque
district
journal
@@ -710,7 +709,6 @@ limite
russe
comite
nature
-section
elections
central
laisse
@@ -1720,7 +1718,6 @@ eugene
regles
completement
remonte
-table
fabrication
salon
orientale
@@ -4588,7 +4585,6 @@ gravure
yougoslavie
ligues
couts
-lang
athenes
souris
gravement
@@ -6701,7 +6697,6 @@ wagner
yaounde
kilometrique
pension
-html
brothers
beaumont
effort
@@ -7642,7 +7637,6 @@ entrainement
commander
recueille
cyril
-head
citadelle
divin
roussillon
@@ -9490,7 +9484,6 @@ mickey
nettoyage
search
audiences
-script
sexes
martha
corne
@@ -10921,7 +10914,6 @@ decrocher
mutuelle
projections
thiers
-body
excessive
romanes
dejeuner
@@ -13070,7 +13062,6 @@ prefets
therapie
chantent
posture
-rowspan
sillage
assemblage
barque
@@ -15394,7 +15385,6 @@ seguin
sinclair
traduisent
vieillard
-colspan
continentaux
corrigee
cynthia
@@ -16316,7 +16306,6 @@ forgeron
mickael
presentaient
pretent
-strong
subtropicales
trancher
binomes
@@ -18166,6 +18155,7 @@ cornet
creerent
denombrait
gracie
+oibook_result
reproduisent
rosette
beast
@@ -20000,3 +19990,13 @@ agresse
bovine
cantique
decime
+dimes
+itinerant
+laiton
+oued
+prisee
+voutees
+borg
+brusque
+camillo
+finalise
diff --git a/tika-eval/src/main/resources/common_tokens/he b/tika-eval/src/main/resources/common_tokens/he
index d0f9acc..d3e84e3 100644
--- a/tika-eval/src/main/resources/common_tokens/he
+++ b/tika-eval/src/main/resources/common_tokens/he
@@ -1703,7 +1703,6 @@ world
ישיבה
נוסד
פוליטי
-html
אליהו
בריטים
חומרים
@@ -19120,8 +19119,6 @@ pages
שירש
שסברו
תיאורו
-head
-lang
mountain
אבקת
אושפז
@@ -19499,6 +19496,7 @@ archive
שיין
שניהלו
שפיתחה
+__toc__
בילו
בסיסה
ברנט
@@ -20000,3 +19998,5 @@ make
מחירו
מירה
מתוארכת
+פירוקה
+שאורך
diff --git a/tika-eval/src/main/resources/common_tokens/hi b/tika-eval/src/main/resources/common_tokens/hi
index bc3aacd..ed7d28c 100644
--- a/tika-eval/src/main/resources/common_tokens/hi
+++ b/tika-eval/src/main/resources/common_tokens/hi
@@ -3831,7 +3831,6 @@ science
समारोहों
सवाधीनता
सॄषटिकरता
-html
अपरतयकष
अरचना
गोविनद
@@ -7718,7 +7717,6 @@ works
हरमन
human
open
-style
अधिकांशत
अभिभावक
अशलील
@@ -9449,10 +9447,8 @@ magazine
सातवाँ
सौतेली
basic
-colspan
corporation
electric
-rowspan
union
western
अजरबैजान
@@ -10640,7 +10636,6 @@ using
हंटर
हिमपात
हैलोजनीकृत
-body
light
method
अकषुणण
@@ -11108,8 +11103,6 @@ height
lake
last
learning
-section
-table
uefa
अंडों
अखणड
@@ -12462,7 +12455,6 @@ chromosome
class
cross
disease
-form
अकेडमी
अदालती
अभिगरहण
@@ -13575,6 +13567,7 @@ very
वारमिंग
विकरण
वितरकों
+विदया_डॉट_कॉम
विनोबा
शरोणि
शारीर
@@ -15347,7 +15340,6 @@ daily
equation
help
hlen
-lang
operation
personal
photographs
@@ -15535,7 +15527,6 @@ performance
season
square
stanford
-title
tourism
yoga
अंतरीप
@@ -15925,7 +15916,6 @@ conference
december
discussion
empire
-head
lecture
maria
organisation
@@ -18230,6 +18220,7 @@ court
glass
hand
ieee
+oibook_result
professional
smith
unesco
@@ -19512,7 +19503,6 @@ object
prime
resolution
rules
-script
seven
simulation
status
@@ -20000,3 +19990,13 @@ wild
पैरोकार
पॉपुलर
पोखर
+फंसी
+फरहा
+फलता
+फलेवर
+फुलर
+फेंककर
+फेमिना
+फोरथ
+बंसी
+बढाये
diff --git a/tika-eval/src/main/resources/common_tokens/id b/tika-eval/src/main/resources/common_tokens/id
index ce7470d..e2981e6 100644
--- a/tika-eval/src/main/resources/common_tokens/id
+++ b/tika-eval/src/main/resources/common_tokens/id
@@ -3163,7 +3163,6 @@ obatan
pengukuran
samudera
berakhirnya
-html
kemanusiaan
ketimbang
teratas
@@ -4913,7 +4912,6 @@ areal
bermotor
desainer
field
-head
logistik
masak
napoleon
@@ -5427,7 +5425,6 @@ membubarkan
myanmar
pengantin
administrator
-body
bundar
eksplisit
pelawak
@@ -5457,7 +5454,6 @@ disponsori
henri
keraton
stanley
-style
teritori
terlestarikan
convention
@@ -7218,7 +7214,6 @@ creative
dibentuklah
dipertanyakan
hello
-lang
menugaskan
mesopotamia
penembakan
@@ -7439,7 +7434,6 @@ monica
penghujung
pengkajian
perbedaannya
-section
tercermin
yudaisme
diasosiasikan
@@ -8636,7 +8630,6 @@ pematang
schmidt
sekarat
sophia
-table
tanaka
transliterasi
watanabe
@@ -9725,7 +9718,6 @@ berjaya
berkuliah
diisolasi
episodenya
-form
gloria
haiti
ketaatan
@@ -10143,7 +10135,6 @@ sanitasi
sekitaran
sosiolog
spektakuler
-title
ying
buntu
carol
@@ -11997,9 +11988,7 @@ rebecca
rhodes
rsud
sarawak
-script
simetri
-strong
talmud
terompet
timotius
@@ -19345,6 +19334,7 @@ twelve
ulet
veneto
wesel
+__indeks__
acarnidae
anjingnya
antoinette
@@ -20000,3 +19990,13 @@ menukarkan
mermaid
momo
negative
+nipis
+noord
+oseanografi
+otomasi
+outside
+paras
+pdrb
+pelapukan
+pemrogram
+penciuman
diff --git a/tika-eval/src/main/resources/common_tokens/it b/tika-eval/src/main/resources/common_tokens/it
index a585210..b902b9e 100644
--- a/tika-eval/src/main/resources/common_tokens/it
+++ b/tika-eval/src/main/resources/common_tokens/it
@@ -7649,7 +7649,6 @@ fronteggiare
suggerito
preliminari
africani
-head
pianeti
collegare
gigi
@@ -9223,7 +9222,6 @@ capostipite
emile
government
grigia
-lang
bestia
vena
misericordia
@@ -9261,7 +9259,6 @@ siberia
anatomia
paralleli
portavoce
-style
blake
collaterali
domestici
@@ -9472,7 +9469,6 @@ harvey
nilo
settimanali
angelis
-body
branca
arbitro
concorsi
@@ -9719,7 +9715,6 @@ hopkins
ininterrottamente
dinamiche
estere
-title
ascolto
assessore
mission
@@ -12697,7 +12692,6 @@ prolifico
risultavano
secchi
essanay
-html
novi
riconciliazione
tondo
@@ -17625,7 +17619,6 @@ marianna
pendii
poligonale
pretore
-strong
tier
aeronautici
affiancate
@@ -18062,7 +18055,6 @@ johanna
penelope
retrocessi
riproducono
-script
swift
traumi
travagliata
@@ -19186,7 +19178,6 @@ ossido
voltaire
blanche
brigadiere
-colspan
decorativa
distribui
gestiscono
@@ -19637,7 +19628,6 @@ semitappa
sfido
similitudini
sion
-table
teodorico
tomaso
avanzamento
@@ -19659,6 +19649,7 @@ rivendicazione
saltato
usuale
visitazione
+__forcetoc__
atmosferico
benetton
bernstein
@@ -20000,3 +19991,12 @@ alternato
bada
collaudo
cyborg
+difficoltosa
+espanola
+espansioni
+inaspettata
+leve
+lowe
+marciano
+monkey
+ovvio
diff --git a/tika-eval/src/main/resources/common_tokens/ja b/tika-eval/src/main/resources/common_tokens/ja
index 6507a68..426ffc8 100644
--- a/tika-eval/src/main/resources/common_tokens/ja
+++ b/tika-eval/src/main/resources/common_tokens/ja
@@ -14528,7 +14528,6 @@ american
革に
メオ
一角
-colspan
に京
ほろ
愛か
@@ -16075,7 +16074,6 @@ party
秩序
勝つ
game
-style
と運
物体
介て
@@ -16091,7 +16089,6 @@ style
ネキ
趣旨
魚の
-rowspan
て歩
ン開
三丁
@@ -20000,3 +19997,6 @@ gold
る心
テノ
ワて
+積も
+を囲
+本郷
diff --git a/tika-eval/src/main/resources/common_tokens/ko b/tika-eval/src/main/resources/common_tokens/ko
index 084231c..7b15bab 100644
--- a/tika-eval/src/main/resources/common_tokens/ko
+++ b/tika-eval/src/main/resources/common_tokens/ko
@@ -17907,7 +17907,6 @@ that
중남
트기
현준
-html
김문
념사
뉘는
@@ -19870,7 +19869,6 @@ award
현관
혜왕
grand
-style
계연
김의
낮았
@@ -20000,3 +19998,5 @@ final
깎아
나디
댈러
+더십
+덕양
diff --git a/tika-eval/src/main/resources/common_tokens/nl b/tika-eval/src/main/resources/common_tokens/nl
index 07197e9..2eb418e 100644
--- a/tika-eval/src/main/resources/common_tokens/nl
+++ b/tika-eval/src/main/resources/common_tokens/nl
@@ -179,7 +179,6 @@ boven
bleef
noord
verenigde
-lang
soorten
stond
mensen
@@ -6423,7 +6422,6 @@ teruggekeerd
vacht
uitgangspunt
dreigt
-head
momenten
transfer
uitgevonden
@@ -6674,7 +6672,6 @@ gehanteerd
keizerin
kiev
nicole
-script
allegro
neue
dure
@@ -7885,7 +7882,6 @@ moravie
realiseerde
rebel
basisplaats
-body
daalt
federaal
opperbevelhebber
@@ -9574,7 +9570,6 @@ meeting
publiciteit
rhin
smits
-style
vlakken
werf
adrien
@@ -10207,7 +10202,6 @@ aantekeningen
australia
essentie
gedachtegoed
-html
typerend
cambodja
dissertatie
@@ -12583,7 +12577,6 @@ ontginning
rufus
secties
serra
-strong
zongen
artists
command
@@ -17971,7 +17964,6 @@ leonardus
lovende
punctata
rossum
-section
trance
uitgeroeid
vieux
@@ -20000,3 +19992,11 @@ huisorde
ingeslagen
inquisitie
kredietcrisis
+limoges
+lochem
+loenen
+maandenlang
+megan
+namiddag
+priorij
+rondo
diff --git a/tika-eval/src/main/resources/common_tokens/pt b/tika-eval/src/main/resources/common_tokens/pt
index 55b9505..507960b 100644
--- a/tika-eval/src/main/resources/common_tokens/pt
+++ b/tika-eval/src/main/resources/common_tokens/pt
@@ -5479,6 +5479,7 @@ guide
vereadores
armado
angra
+find_ranktoreturn
falhas
fita
ocorridos
@@ -5572,6 +5573,7 @@ convencional
projetada
bell
extremidade
+species_xref
minerais
oracao
ligando
@@ -6502,7 +6504,6 @@ theatre
aproximar
broadway
espectral
-html
denominados
petersburgo
causados
@@ -7749,7 +7750,6 @@ descricoes
estudando
minoria
mundos
-style
doentes
egipcia
potenciais
@@ -8066,7 +8066,6 @@ seda
vinil
comissario
construindo
-head
livrar
parecida
cordeiro
@@ -10482,7 +10481,6 @@ trato
armadilha
desculpas
incorporando
-lang
anda
forno
revestimento
@@ -11165,7 +11163,6 @@ liberar
mozart
vietname
acusada
-body
cessar
compressao
demonstraram
@@ -12782,7 +12779,6 @@ perfect
prize
recopa
sacrificios
-script
solteiro
springs
stories
@@ -15122,7 +15118,6 @@ severino
soluvel
tijolo
bizantinas
-colspan
cursar
educar
eslavos
@@ -15745,7 +15740,6 @@ praticando
reunida
sabido
sterling
-strong
tigres
vulnerabilidade
arenas
@@ -17134,6 +17128,7 @@ morton
september
subdividida
tara
+taxon_name_xref
tornavam
aparencias
atualizados
@@ -19770,7 +19765,6 @@ recanto
ressentimento
shonen
smackdown
-table
tape
wembley
adventista
@@ -20000,3 +19994,9 @@ medicacao
participaria
pendente
penitencia
+prejudiciais
+questionamento
+racista
+rampas
+rectangular
+republicanas
diff --git a/tika-eval/src/main/resources/common_tokens/ru b/tika-eval/src/main/resources/common_tokens/ru
index 82514f2..025c91b 100644
--- a/tika-eval/src/main/resources/common_tokens/ru
+++ b/tika-eval/src/main/resources/common_tokens/ru
@@ -6361,7 +6361,6 @@ house
англииских
китаиского
участница
-html
official
газет
вокзала
@@ -14180,7 +14179,6 @@ business
простирается
пулемет
рязань
-lang
корону
олимпииском
выплаты
@@ -17330,7 +17328,6 @@ forbes
спин
суды
хозяиству
-head
маркграф
представленныи
развлечения
@@ -19177,6 +19174,7 @@ army
раскрывается
сегмента
финансовое
+__toc__
вчера
выкуп
годовщину
@@ -19495,7 +19493,6 @@ ethnologue
своеобразнои
станице
теоремы
-style
бокса
городища
доставили
@@ -20000,3 +19997,6 @@ opera
блоке
инженерныи
ковалев
+нарушениями
+плавник
+платформах
diff --git a/tika-eval/src/main/resources/common_tokens/ur b/tika-eval/src/main/resources/common_tokens/ur
index d59ce24..34bb949 100644
--- a/tika-eval/src/main/resources/common_tokens/ur
+++ b/tika-eval/src/main/resources/common_tokens/ur
@@ -6016,6 +6016,7 @@ ghost
ہالٹ
ہوشيار
ہيٹي
+__toc__
cricket
power
ابھرا
@@ -6228,7 +6229,6 @@ general
class
computer
hilaire
-html
museum
research
ابوالقاسم
@@ -9322,7 +9322,6 @@ main
office
radio
singh
-table
tomb
اباواجداد
ابرار
@@ -10717,7 +10716,6 @@ money
mont
ouen
president
-section
version
ابراهيم
احتجاجات
@@ -11472,7 +11470,6 @@ test
based
battle
being
-body
club
complex
cross
@@ -13911,7 +13908,6 @@ forum
grande
guardian
hamilton
-head
hockey
jewish
justice
@@ -14262,7 +14258,6 @@ results
rockingham
role
serbia
-style
tunisia
ابروريزي
ابوجہل
@@ -15036,7 +15031,6 @@ february
fission
five
foret
-form
front
gervais
hospital
@@ -15699,6 +15693,7 @@ woman
سپردگي
سہارن
شاقہ
+شاہ_عالم_ثاني
شرجيل
شعاري
شفافيت
@@ -16710,6 +16705,7 @@ young
روسايے
روكس
رولنگ
+رياست_حيدراباد
ريسكيو
ريشوں
ريكي
@@ -18233,7 +18229,6 @@ random
range
ranking
routledge
-script
sierra
solution
sourcebl
@@ -18247,7 +18242,6 @@ syndrome
terrorism
texts
tier
-title
tradition
units
until
@@ -19394,6 +19388,7 @@ yonne
محتلف
محرابيں
محرر
+محمد_شاہ
محير
مخبر
مخلصين
@@ -19740,6 +19735,7 @@ weapon
افسس
افٹر
اقليدسي
+اكبر_شاہ_ثاني
اكسيد
اكلان
الابيض
@@ -20000,3 +19996,7 @@ weapon
ريلياں
رينالڈز
ريوں
+رچفيلڈ
+رڈيارڈ
+زاروں
+زايدہ
diff --git a/tika-eval/src/main/resources/common_tokens/vi b/tika-eval/src/main/resources/common_tokens/vi
index 32b47c4..98684b8 100644
--- a/tika-eval/src/main/resources/common_tokens/vi
+++ b/tika-eval/src/main/resources/common_tokens/vi
@@ -92,7 +92,6 @@ sach
chan
cach
mien
-lang
phai
phia
nhien
@@ -206,7 +205,6 @@ nghien
nhanh
rieng
bung
-table
nghiem
thuan
cham
@@ -311,7 +309,6 @@ giet
thon
nuoi
mexico
-body
crambidae
ranh
john
@@ -1162,7 +1159,6 @@ sydney
attelabidae
koch
moore
-html
myanma
nghenh
major
@@ -1867,7 +1863,6 @@ michelin
anton
isaac
record
-style
march
nevada
robin
@@ -1994,7 +1989,6 @@ stuart
yale
debehogne
dennstaedtiaceae
-head
kingdom
mount
bartramiaceae
@@ -3139,7 +3133,6 @@ diana
hadena
mammal
medical
-section
snow
hoffman
kiss
@@ -3573,7 +3566,6 @@ nitida
overview
pocock
satellite
-form
hatt
journey
lopatin
@@ -4150,7 +4142,6 @@ gleicheniaceae
medina
polk
sherman
-title
touch
allison
bethlehem
@@ -4931,7 +4922,6 @@ societe
somerset
spermacoce
stephanie
-strong
technologies
africanus
along
@@ -7084,7 +7074,6 @@ boom
bratislava
chlorophytum
coleophora
-colspan
columnea
cortinariaceae
cybocephalus
@@ -7270,6 +7259,7 @@ mecca
miocen
miwa
obtusa
+oibook_result
pallescens
perm
poems
@@ -7836,7 +7826,6 @@ patellapis
plastic
potamogetonaceae
rudolph
-script
shawn
shorea
smicridea
@@ -8452,7 +8441,6 @@ perrier
rada
railway
rockefeller
-rowspan
sort
specific
theresa
@@ -10872,6 +10860,7 @@ valentina
vehicles
yalta
acuminatus
+al_op
alpinus
asarum
ater
@@ -14360,6 +14349,7 @@ faulkner
fortis
fresno
funet
+gbs_navlinks_s
goddard
granger
grodno
@@ -17025,6 +17015,7 @@ awakening
bacillus
bears
benn
+bic_bromcr
blackpool
botafogo
brasiliense
@@ -17568,7 +17559,6 @@ samir
sekigahara
selene
sera
-span
staffordshire
stations
steinberg
@@ -20000,3 +19990,13 @@ saburo
sagittata
samguk
sanborn
+sanna
+sarikamis
+savsat
+schizachyrium
+schlesinger
+sebinkarahisar
+senshi
+serik
+seriz
+serotina
diff --git a/tika-eval/src/main/resources/common_tokens/zh-cn b/tika-eval/src/main/resources/common_tokens/zh-cn
index 4cb5216..cd97411 100644
--- a/tika-eval/src/main/resources/common_tokens/zh-cn
+++ b/tika-eval/src/main/resources/common_tokens/zh-cn
@@ -17273,7 +17273,6 @@ thomas
让她
造业
都拉
-style
为从
任妻
但都
@@ -20000,3 +19999,4 @@ playstation
为元
为布
了太
+他正
diff --git a/tika-eval/src/main/resources/common_tokens/zh-tw b/tika-eval/src/main/resources/common_tokens/zh-tw
index 6f85f5c..23be28f 100644
--- a/tika-eval/src/main/resources/common_tokens/zh-tw
+++ b/tika-eval/src/main/resources/common_tokens/zh-tw
@@ -17267,7 +17267,6 @@ thomas
让她
造业
都拉
-style
为从
任妻
但都
@@ -20000,3 +19999,4 @@ playstation
是國
有改
有证
+树属