You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/30 18:05:32 UTC

[tika] branch master updated: TIKA-2822 -- remove common >=4 letter html markup entities

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 8c22f05  TIKA-2822 -- remove common >=4 letter html markup entities
8c22f05 is described below

commit 8c22f054ea94526e6d22a3f4c923e0b8724f2831
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jan 30 13:05:09 2019 -0500

    TIKA-2822 -- remove common >=4 letter html markup entities
---
 .../tika/eval/tools/TopCommonTokenCounter.java     | 30 +++++++++++++++++++++-
 tika-eval/src/main/resources/common_tokens/ar      |  4 +--
 tika-eval/src/main/resources/common_tokens/bn      | 18 ++++++-------
 tika-eval/src/main/resources/common_tokens/de      | 18 ++++++-------
 tika-eval/src/main/resources/common_tokens/el      | 18 ++++++-------
 tika-eval/src/main/resources/common_tokens/en      | 26 +++++++++----------
 tika-eval/src/main/resources/common_tokens/es      | 14 +++++-----
 tika-eval/src/main/resources/common_tokens/fa      | 16 ++++++------
 tika-eval/src/main/resources/common_tokens/fr      | 22 ++++++++--------
 tika-eval/src/main/resources/common_tokens/he      |  6 ++---
 tika-eval/src/main/resources/common_tokens/hi      | 24 ++++++++---------
 tika-eval/src/main/resources/common_tokens/id      | 22 ++++++++--------
 tika-eval/src/main/resources/common_tokens/it      | 20 +++++++--------
 tika-eval/src/main/resources/common_tokens/ja      |  6 ++---
 tika-eval/src/main/resources/common_tokens/ko      |  4 +--
 tika-eval/src/main/resources/common_tokens/nl      | 16 ++++++------
 tika-eval/src/main/resources/common_tokens/pt      | 18 ++++++-------
 tika-eval/src/main/resources/common_tokens/ru      |  8 +++---
 tika-eval/src/main/resources/common_tokens/ur      | 18 ++++++-------
 tika-eval/src/main/resources/common_tokens/vi      | 28 ++++++++++----------
 tika-eval/src/main/resources/common_tokens/zh-cn   |  2 +-
 tika-eval/src/main/resources/common_tokens/zh-tw   |  2 +-
 22 files changed, 184 insertions(+), 156 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
index 5fdd7e8..4d76b7c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
@@ -56,6 +56,9 @@ import org.apache.tika.eval.tokens.AnalyzerManager;
  *
  * The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
  * but includes bigrams for cjk.
+ *
+ * It also has a white list for __email__ and __url__ and a black list
+ * for common html markup terms.
  */
 public class TopCommonTokenCounter {
     private static final String FIELD = "f";
@@ -69,6 +72,31 @@ public class TopCommonTokenCounter {
             }
     ));
 
+    //words to ignore
+    //these are common html markup words of 4 or more letters that we do
+    //not want to count in case of failed markup processing.
+    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
+    static Set<String> BLACK_LIST = new HashSet<>(Arrays.asList(
+            "span",
+            "table",
+            "href",
+            "head",
+            "title",
+            "body",
+            "html",
+            "tagname",
+            "lang",
+            "style",
+            "script",
+            "strong",
+            "blockquote",
+            "form",
+            "iframe",
+            "section",
+            "colspan",
+            "rowspan"
+    ));
+
     public static void main(String[] args) throws Exception {
         Path inputFile = Paths.get(args[0]);
         Path commonTokensFile = Paths.get(args[1]);
@@ -127,7 +155,7 @@ public class TopCommonTokenCounter {
                     if (queue.top() == null || queue.size() < TOP_N ||
                             df >= queue.top().df) {
                         String t = bytesRef.utf8ToString();
-                        if (! WHITE_LIST.contains(t)) {
+                        if (! WHITE_LIST.contains(t) && ! BLACK_LIST.contains(t)) {
                             queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                         }
 
diff --git a/tika-eval/src/main/resources/common_tokens/ar b/tika-eval/src/main/resources/common_tokens/ar
index c99bf24..5a9a6c4 100644
--- a/tika-eval/src/main/resources/common_tokens/ar
+++ b/tika-eval/src/main/resources/common_tokens/ar
@@ -6975,7 +6975,6 @@ center
 سمات
 لوحظ
 مسافات
-html
 الحافظ
 اواسط
 اياه
@@ -9605,7 +9604,6 @@ school
 عينيه
 مرجع
 ولاد
-style
 الابنه
 الامني
 الانخفاض
@@ -20000,3 +19998,5 @@ california
 شنقا
 طرفه
 فانتقل
+قابوس
+لاسبانيا
diff --git a/tika-eval/src/main/resources/common_tokens/bn b/tika-eval/src/main/resources/common_tokens/bn
index 64558be..950d7e5 100644
--- a/tika-eval/src/main/resources/common_tokens/bn
+++ b/tika-eval/src/main/resources/common_tokens/bn
@@ -3635,7 +3635,6 @@ film
 রাননা
 রোডে
 সংসদে
-html
 অনুরোধে
 ডানার
 পাযনি
@@ -9035,7 +9034,6 @@ editions
 illustrated
 kingdom
 period
-section
 অবমুকত
 অভযনতরের
 আওতাভুকত
@@ -10146,7 +10144,6 @@ germany
 awards
 editor
 sanskrit
-title
 অনযভাবে
 অবহেলিত
 অসফল
@@ -10282,7 +10279,6 @@ where
 সালতানাত
 সুষম
 হতাশা
-body
 standard
 অনিবারয
 অবদি
@@ -11109,7 +11105,6 @@ technical
 cinema
 class
 eastern
-form
 sites
 translations
 অধযযনকালীন
@@ -11208,7 +11203,6 @@ independent
 past
 services
 song
-style
 well
 whois
 wiki
@@ -11926,7 +11920,6 @@ archived
 known
 lives
 roerich
-table
 অঞজন
 অবযহতি
 অবিশবাসয
@@ -14533,7 +14526,6 @@ disease
 educational
 function
 geographic
-head
 johnson
 leiden
 master
@@ -17718,7 +17710,6 @@ hand
 highest
 http
 java
-lang
 legal
 market
 metropolitan
@@ -20000,3 +19991,12 @@ wars
 কযামবরিজের
 করমদকষতা
 করমীকে
+কলাকৌশল
+কলাসিকাল
+কলোরিন
+কারযপদধতি
+কুচবিহার
+কুলসুম
+কুশিযারা
+কৃষণনগরের
+কেরিযার
diff --git a/tika-eval/src/main/resources/common_tokens/de b/tika-eval/src/main/resources/common_tokens/de
index a163e83..517e5cc 100644
--- a/tika-eval/src/main/resources/common_tokens/de
+++ b/tika-eval/src/main/resources/common_tokens/de
@@ -228,7 +228,6 @@ soll
 land
 sollte
 darauf
-lang
 ubernahm
 lage
 weltkrieg
@@ -239,7 +238,6 @@ innerhalb
 davon
 fuhrt
 frankreich
-form
 quellen
 frau
 gruppe
@@ -3124,7 +3122,6 @@ konzert
 trugen
 last
 norbert
-html
 kunden
 sekunden
 stiess
@@ -6701,7 +6698,6 @@ camp
 komplexen
 bilanz
 lokomotiven
-head
 bindung
 pays
 tisch
@@ -8235,6 +8231,7 @@ landsmann
 moral
 abfahrt
 geraden
+__toc__
 poesie
 beliebten
 adelsfamilie
@@ -9026,7 +9023,6 @@ absoluten
 magic
 eisenbahnen
 ereignete
-body
 giorgio
 grabmal
 verwandelt
@@ -9988,6 +9984,7 @@ telefon
 fehlten
 hallen
 verkehrten
+__noeditsection__
 empfanger
 mosel
 geschutztes
@@ -10326,7 +10323,6 @@ railroad
 universidad
 else
 sozialdemokraten
-style
 utah
 bauingenieur
 gebrauchliche
@@ -12813,7 +12809,6 @@ geometrischen
 reuter
 libyen
 reportage
-section
 archivs
 autounfall
 energien
@@ -12921,7 +12916,6 @@ ersichtlich
 iwanowitsch
 klee
 local
-span
 ruder
 aufstiegsrunde
 stadtbefestigung
@@ -16585,6 +16579,7 @@ verwirrung
 gernot
 handelns
 nutzlich
+oibook_result
 sudteil
 wasserstand
 visuellen
@@ -17220,7 +17215,6 @@ dramatisch
 irgendwo
 popularsten
 schulung
-table
 zubereitung
 belassen
 eichenlaub
@@ -20000,3 +19994,9 @@ gepfarrt
 individuums
 phrase
 priestern
+vorstellte
+witte
+wolga
+lampe
+matrosen
+niedersachsens
diff --git a/tika-eval/src/main/resources/common_tokens/el b/tika-eval/src/main/resources/common_tokens/el
index 4124796..05a8af1 100644
--- a/tika-eval/src/main/resources/common_tokens/el
+++ b/tika-eval/src/main/resources/common_tokens/el
@@ -4335,7 +4335,6 @@ smith
 απαγορευση
 πηγων
 χριστιανων
-html
 αρχηγου
 βραζιλιασ
 δημιουργουνται
@@ -6710,7 +6709,6 @@ frederick
 μιση
 πεποιθησεισ
 ρωμαικου
-style
 εξωτερικησ
 θυσια
 μακεδονικου
@@ -7907,7 +7905,6 @@ political
 χαριν
 council
 jacques
-lang
 αθηναιουσ
 ασκειται
 διεξαγει
@@ -10153,7 +10150,6 @@ department
 φιλοδοξιεσ
 χαοσ
 domain
-head
 literature
 αναλογωσ
 θερετρο
@@ -11114,7 +11110,6 @@ republic
 σωζομενα
 τσεχοσλοβακιασ
 φυλασσονται
-colspan
 never
 penguin
 γεωμετρικη
@@ -13799,7 +13794,6 @@ army
 birds
 ferdinand
 mario
-table
 αλγεβρα
 ανεβαινουν
 απογοητευμενοσ
@@ -13959,7 +13953,6 @@ ultimate
 συμφωνιων
 υπογραφτηκε
 χωρισμο
-body
 heritage
 springer
 ακτινασ
@@ -16350,7 +16343,6 @@ bulletin
 estadio
 etudes
 look
-title
 village
 αθλητικουσ
 αλικησ
@@ -18645,7 +18637,6 @@ demo
 gregory
 hamlyn
 klaus
-section
 temple
 wisconsin
 αβεβαιοτητα
@@ -20000,3 +19991,12 @@ snow
 κοκκινησ
 λορεντσο
 μαξιμιλιανου
+μαχαιρια
+μεταβληθηκε
+μεταγραφεσ
+μεταλλουργια
+μπαλκονι
+μπερκ
+ναυπηγηση
+νευτωνα
+οικοδομικη
diff --git a/tika-eval/src/main/resources/common_tokens/en b/tika-eval/src/main/resources/common_tokens/en
index 04ce8d2..f1cb892 100644
--- a/tika-eval/src/main/resources/common_tokens/en
+++ b/tika-eval/src/main/resources/common_tokens/en
@@ -192,7 +192,6 @@ system
 become
 different
 given
-form
 went
 established
 came
@@ -220,7 +219,6 @@ play
 currently
 period
 originally
-head
 president
 young
 track
@@ -233,7 +231,6 @@ works
 once
 joined
 using
-title
 river
 making
 community
@@ -388,7 +385,6 @@ release
 radio
 research
 previously
-style
 night
 help
 remained
@@ -436,7 +432,6 @@ round
 charles
 designed
 court
-body
 province
 largest
 michael
@@ -552,7 +547,6 @@ spent
 peter
 finally
 introduced
-strong
 running
 nine
 complete
@@ -593,7 +587,6 @@ native
 blue
 recent
 administrative
-section
 biography
 culture
 plays
@@ -1211,7 +1204,6 @@ democratic
 reason
 francisco
 kind
-table
 growth
 hour
 territory
@@ -3191,7 +3183,6 @@ quick
 sing
 influences
 louisiana
-script
 fighter
 hungarian
 recipient
@@ -4291,7 +4282,6 @@ palm
 associates
 finnish
 diplomatic
-span
 innovation
 random
 vegas
@@ -7234,7 +7224,6 @@ stealing
 uganda
 satisfied
 confrontation
-lang
 trailer
 flooded
 loans
@@ -11523,7 +11512,6 @@ illuminated
 rockefeller
 surgeons
 hydroelectric
-colspan
 rusty
 wolverhampton
 balkans
@@ -13334,7 +13322,6 @@ cradle
 engraver
 espana
 fanny
-html
 magnum
 cathy
 classify
@@ -13796,6 +13783,7 @@ ardent
 beautifully
 befriends
 nana
+__toc__
 atoll
 ballistic
 collier
@@ -20000,3 +19988,15 @@ sore
 whitfield
 adventist
 breaches
+councilman
+cristo
+extremist
+fitch
+fondness
+madeline
+neuronal
+scramble
+sigismund
+undermining
+wladyslaw
+bette
diff --git a/tika-eval/src/main/resources/common_tokens/es b/tika-eval/src/main/resources/common_tokens/es
index af172da..f707d94 100644
--- a/tika-eval/src/main/resources/common_tokens/es
+++ b/tika-eval/src/main/resources/common_tokens/es
@@ -7106,7 +7106,6 @@ sensible
 haciendose
 cubren
 asistencias
-html
 plus
 afirmar
 disenar
@@ -8504,7 +8503,6 @@ rechazar
 arkansas
 women
 demarcacion
-head
 sujeta
 chiapas
 fanaticos
@@ -8981,7 +8979,6 @@ march
 exigencias
 lozano
 colega
-style
 hibrido
 muller
 lanzando
@@ -10273,7 +10270,6 @@ confirmacion
 friends
 peregrinos
 prometido
-lang
 lomas
 funk
 invadir
@@ -12702,7 +12698,6 @@ inmigrante
 mango
 orquidea
 alternas
-body
 mohamed
 mudejar
 analysis
@@ -16715,7 +16710,6 @@ pumas
 quieres
 serbios
 solteros
-table
 adscrita
 cronologicamente
 entregan
@@ -17465,7 +17459,6 @@ doblado
 mendizabal
 resiste
 simetrica
-strong
 superviso
 timido
 tributarios
@@ -19518,6 +19511,7 @@ dinamicos
 entendia
 finn
 maiden
+oibook_result
 regido
 sanguinea
 saturnino
@@ -20000,3 +19994,9 @@ programar
 reducciones
 sucesivo
 suroriental
+vocablos
+badalona
+cilindricas
+coordinada
+cunada
+elio
diff --git a/tika-eval/src/main/resources/common_tokens/fa b/tika-eval/src/main/resources/common_tokens/fa
index 6cedfdc..f63d3bc 100644
--- a/tika-eval/src/main/resources/common_tokens/fa
+++ b/tika-eval/src/main/resources/common_tokens/fa
@@ -4949,7 +4949,6 @@ gmina
 الهيات
 كارامد
 مورگان
-html
 خودروسازي
 ماهواره
 مغولستان
@@ -11097,7 +11096,6 @@ report
 هرمي
 همانا
 پايينتري
-lang
 ابرانساني
 الزمان
 بردارند
@@ -13986,7 +13984,6 @@ pictures
 carolina
 death
 golden
-section
 اتفاقا
 اروپاست
 ازدحام
@@ -14950,7 +14947,7 @@ while
 گراديان
 گنبدهاي
 گولد
-head
+__toc__
 heart
 party
 principles
@@ -15993,11 +15990,8 @@ organization
 گدانسك
 گيريهاي
 advanced
-body
 england
-form
 nova
-table
 احترامي
 ارايهاي
 اشتها
@@ -18152,7 +18146,6 @@ catalog
 florida
 role
 selected
-style
 tehran
 than
 ابوتراب
@@ -20000,3 +19993,10 @@ provincia
 نازكتر
 ناكارامدي
 نانا
+نهادهايي
+نيمرخ
+هابيت
+هاليفاكس
+هدفشان
+همجوشي
+ولپي
diff --git a/tika-eval/src/main/resources/common_tokens/fr b/tika-eval/src/main/resources/common_tokens/fr
index 43a8f7f..216eaca 100644
--- a/tika-eval/src/main/resources/common_tokens/fr
+++ b/tika-eval/src/main/resources/common_tokens/fr
@@ -461,7 +461,6 @@ superieur
 gauche
 voit
 suisse
-style
 epoque
 district
 journal
@@ -710,7 +709,6 @@ limite
 russe
 comite
 nature
-section
 elections
 central
 laisse
@@ -1720,7 +1718,6 @@ eugene
 regles
 completement
 remonte
-table
 fabrication
 salon
 orientale
@@ -4588,7 +4585,6 @@ gravure
 yougoslavie
 ligues
 couts
-lang
 athenes
 souris
 gravement
@@ -6701,7 +6697,6 @@ wagner
 yaounde
 kilometrique
 pension
-html
 brothers
 beaumont
 effort
@@ -7642,7 +7637,6 @@ entrainement
 commander
 recueille
 cyril
-head
 citadelle
 divin
 roussillon
@@ -9490,7 +9484,6 @@ mickey
 nettoyage
 search
 audiences
-script
 sexes
 martha
 corne
@@ -10921,7 +10914,6 @@ decrocher
 mutuelle
 projections
 thiers
-body
 excessive
 romanes
 dejeuner
@@ -13070,7 +13062,6 @@ prefets
 therapie
 chantent
 posture
-rowspan
 sillage
 assemblage
 barque
@@ -15394,7 +15385,6 @@ seguin
 sinclair
 traduisent
 vieillard
-colspan
 continentaux
 corrigee
 cynthia
@@ -16316,7 +16306,6 @@ forgeron
 mickael
 presentaient
 pretent
-strong
 subtropicales
 trancher
 binomes
@@ -18166,6 +18155,7 @@ cornet
 creerent
 denombrait
 gracie
+oibook_result
 reproduisent
 rosette
 beast
@@ -20000,3 +19990,13 @@ agresse
 bovine
 cantique
 decime
+dimes
+itinerant
+laiton
+oued
+prisee
+voutees
+borg
+brusque
+camillo
+finalise
diff --git a/tika-eval/src/main/resources/common_tokens/he b/tika-eval/src/main/resources/common_tokens/he
index d0f9acc..d3e84e3 100644
--- a/tika-eval/src/main/resources/common_tokens/he
+++ b/tika-eval/src/main/resources/common_tokens/he
@@ -1703,7 +1703,6 @@ world
 ישיבה
 נוסד
 פוליטי
-html
 אליהו
 בריטים
 חומרים
@@ -19120,8 +19119,6 @@ pages
 שירש
 שסברו
 תיאורו
-head
-lang
 mountain
 אבקת
 אושפז
@@ -19499,6 +19496,7 @@ archive
 שיין
 שניהלו
 שפיתחה
+__toc__
 בילו
 בסיסה
 ברנט
@@ -20000,3 +19998,5 @@ make
 מחירו
 מירה
 מתוארכת
+פירוקה
+שאורך
diff --git a/tika-eval/src/main/resources/common_tokens/hi b/tika-eval/src/main/resources/common_tokens/hi
index bc3aacd..ed7d28c 100644
--- a/tika-eval/src/main/resources/common_tokens/hi
+++ b/tika-eval/src/main/resources/common_tokens/hi
@@ -3831,7 +3831,6 @@ science
 समारोहों
 सवाधीनता
 सॄषटिकरता
-html
 अपरतयकष
 अरचना
 गोविनद
@@ -7718,7 +7717,6 @@ works
 हरमन
 human
 open
-style
 अधिकांशत
 अभिभावक
 अशलील
@@ -9449,10 +9447,8 @@ magazine
 सातवाँ
 सौतेली
 basic
-colspan
 corporation
 electric
-rowspan
 union
 western
 अजरबैजान
@@ -10640,7 +10636,6 @@ using
 हंटर
 हिमपात
 हैलोजनीकृत
-body
 light
 method
 अकषुणण
@@ -11108,8 +11103,6 @@ height
 lake
 last
 learning
-section
-table
 uefa
 अंडों
 अखणड
@@ -12462,7 +12455,6 @@ chromosome
 class
 cross
 disease
-form
 अकेडमी
 अदालती
 अभिगरहण
@@ -13575,6 +13567,7 @@ very
 वारमिंग
 विकरण
 वितरकों
+विदया_डॉट_कॉम
 विनोबा
 शरोणि
 शारीर
@@ -15347,7 +15340,6 @@ daily
 equation
 help
 hlen
-lang
 operation
 personal
 photographs
@@ -15535,7 +15527,6 @@ performance
 season
 square
 stanford
-title
 tourism
 yoga
 अंतरीप
@@ -15925,7 +15916,6 @@ conference
 december
 discussion
 empire
-head
 lecture
 maria
 organisation
@@ -18230,6 +18220,7 @@ court
 glass
 hand
 ieee
+oibook_result
 professional
 smith
 unesco
@@ -19512,7 +19503,6 @@ object
 prime
 resolution
 rules
-script
 seven
 simulation
 status
@@ -20000,3 +19990,13 @@ wild
 पैरोकार
 पॉपुलर
 पोखर
+फंसी
+फरहा
+फलता
+फलेवर
+फुलर
+फेंककर
+फेमिना
+फोरथ
+बंसी
+बढाये
diff --git a/tika-eval/src/main/resources/common_tokens/id b/tika-eval/src/main/resources/common_tokens/id
index ce7470d..e2981e6 100644
--- a/tika-eval/src/main/resources/common_tokens/id
+++ b/tika-eval/src/main/resources/common_tokens/id
@@ -3163,7 +3163,6 @@ obatan
 pengukuran
 samudera
 berakhirnya
-html
 kemanusiaan
 ketimbang
 teratas
@@ -4913,7 +4912,6 @@ areal
 bermotor
 desainer
 field
-head
 logistik
 masak
 napoleon
@@ -5427,7 +5425,6 @@ membubarkan
 myanmar
 pengantin
 administrator
-body
 bundar
 eksplisit
 pelawak
@@ -5457,7 +5454,6 @@ disponsori
 henri
 keraton
 stanley
-style
 teritori
 terlestarikan
 convention
@@ -7218,7 +7214,6 @@ creative
 dibentuklah
 dipertanyakan
 hello
-lang
 menugaskan
 mesopotamia
 penembakan
@@ -7439,7 +7434,6 @@ monica
 penghujung
 pengkajian
 perbedaannya
-section
 tercermin
 yudaisme
 diasosiasikan
@@ -8636,7 +8630,6 @@ pematang
 schmidt
 sekarat
 sophia
-table
 tanaka
 transliterasi
 watanabe
@@ -9725,7 +9718,6 @@ berjaya
 berkuliah
 diisolasi
 episodenya
-form
 gloria
 haiti
 ketaatan
@@ -10143,7 +10135,6 @@ sanitasi
 sekitaran
 sosiolog
 spektakuler
-title
 ying
 buntu
 carol
@@ -11997,9 +11988,7 @@ rebecca
 rhodes
 rsud
 sarawak
-script
 simetri
-strong
 talmud
 terompet
 timotius
@@ -19345,6 +19334,7 @@ twelve
 ulet
 veneto
 wesel
+__indeks__
 acarnidae
 anjingnya
 antoinette
@@ -20000,3 +19990,13 @@ menukarkan
 mermaid
 momo
 negative
+nipis
+noord
+oseanografi
+otomasi
+outside
+paras
+pdrb
+pelapukan
+pemrogram
+penciuman
diff --git a/tika-eval/src/main/resources/common_tokens/it b/tika-eval/src/main/resources/common_tokens/it
index a585210..b902b9e 100644
--- a/tika-eval/src/main/resources/common_tokens/it
+++ b/tika-eval/src/main/resources/common_tokens/it
@@ -7649,7 +7649,6 @@ fronteggiare
 suggerito
 preliminari
 africani
-head
 pianeti
 collegare
 gigi
@@ -9223,7 +9222,6 @@ capostipite
 emile
 government
 grigia
-lang
 bestia
 vena
 misericordia
@@ -9261,7 +9259,6 @@ siberia
 anatomia
 paralleli
 portavoce
-style
 blake
 collaterali
 domestici
@@ -9472,7 +9469,6 @@ harvey
 nilo
 settimanali
 angelis
-body
 branca
 arbitro
 concorsi
@@ -9719,7 +9715,6 @@ hopkins
 ininterrottamente
 dinamiche
 estere
-title
 ascolto
 assessore
 mission
@@ -12697,7 +12692,6 @@ prolifico
 risultavano
 secchi
 essanay
-html
 novi
 riconciliazione
 tondo
@@ -17625,7 +17619,6 @@ marianna
 pendii
 poligonale
 pretore
-strong
 tier
 aeronautici
 affiancate
@@ -18062,7 +18055,6 @@ johanna
 penelope
 retrocessi
 riproducono
-script
 swift
 traumi
 travagliata
@@ -19186,7 +19178,6 @@ ossido
 voltaire
 blanche
 brigadiere
-colspan
 decorativa
 distribui
 gestiscono
@@ -19637,7 +19628,6 @@ semitappa
 sfido
 similitudini
 sion
-table
 teodorico
 tomaso
 avanzamento
@@ -19659,6 +19649,7 @@ rivendicazione
 saltato
 usuale
 visitazione
+__forcetoc__
 atmosferico
 benetton
 bernstein
@@ -20000,3 +19991,12 @@ alternato
 bada
 collaudo
 cyborg
+difficoltosa
+espanola
+espansioni
+inaspettata
+leve
+lowe
+marciano
+monkey
+ovvio
diff --git a/tika-eval/src/main/resources/common_tokens/ja b/tika-eval/src/main/resources/common_tokens/ja
index 6507a68..426ffc8 100644
--- a/tika-eval/src/main/resources/common_tokens/ja
+++ b/tika-eval/src/main/resources/common_tokens/ja
@@ -14528,7 +14528,6 @@ american
 革に
 メオ
 一角
-colspan
 に京
 ほろ
 愛か
@@ -16075,7 +16074,6 @@ party
 秩序
 勝つ
 game
-style
 と運
 物体
 介て
@@ -16091,7 +16089,6 @@ style
 ネキ
 趣旨
 魚の
-rowspan
 て歩
 ン開
 三丁
@@ -20000,3 +19997,6 @@ gold
 る心
 テノ
 ワて
+積も
+を囲
+本郷
diff --git a/tika-eval/src/main/resources/common_tokens/ko b/tika-eval/src/main/resources/common_tokens/ko
index 084231c..7b15bab 100644
--- a/tika-eval/src/main/resources/common_tokens/ko
+++ b/tika-eval/src/main/resources/common_tokens/ko
@@ -17907,7 +17907,6 @@ that
 중남
 트기
 현준
-html
 김문
 념사
 뉘는
@@ -19870,7 +19869,6 @@ award
 현관
 혜왕
 grand
-style
 계연
 김의
 낮았
@@ -20000,3 +19998,5 @@ final
 깎아
 나디
 댈러
+더십
+덕양
diff --git a/tika-eval/src/main/resources/common_tokens/nl b/tika-eval/src/main/resources/common_tokens/nl
index 07197e9..2eb418e 100644
--- a/tika-eval/src/main/resources/common_tokens/nl
+++ b/tika-eval/src/main/resources/common_tokens/nl
@@ -179,7 +179,6 @@ boven
 bleef
 noord
 verenigde
-lang
 soorten
 stond
 mensen
@@ -6423,7 +6422,6 @@ teruggekeerd
 vacht
 uitgangspunt
 dreigt
-head
 momenten
 transfer
 uitgevonden
@@ -6674,7 +6672,6 @@ gehanteerd
 keizerin
 kiev
 nicole
-script
 allegro
 neue
 dure
@@ -7885,7 +7882,6 @@ moravie
 realiseerde
 rebel
 basisplaats
-body
 daalt
 federaal
 opperbevelhebber
@@ -9574,7 +9570,6 @@ meeting
 publiciteit
 rhin
 smits
-style
 vlakken
 werf
 adrien
@@ -10207,7 +10202,6 @@ aantekeningen
 australia
 essentie
 gedachtegoed
-html
 typerend
 cambodja
 dissertatie
@@ -12583,7 +12577,6 @@ ontginning
 rufus
 secties
 serra
-strong
 zongen
 artists
 command
@@ -17971,7 +17964,6 @@ leonardus
 lovende
 punctata
 rossum
-section
 trance
 uitgeroeid
 vieux
@@ -20000,3 +19992,11 @@ huisorde
 ingeslagen
 inquisitie
 kredietcrisis
+limoges
+lochem
+loenen
+maandenlang
+megan
+namiddag
+priorij
+rondo
diff --git a/tika-eval/src/main/resources/common_tokens/pt b/tika-eval/src/main/resources/common_tokens/pt
index 55b9505..507960b 100644
--- a/tika-eval/src/main/resources/common_tokens/pt
+++ b/tika-eval/src/main/resources/common_tokens/pt
@@ -5479,6 +5479,7 @@ guide
 vereadores
 armado
 angra
+find_ranktoreturn
 falhas
 fita
 ocorridos
@@ -5572,6 +5573,7 @@ convencional
 projetada
 bell
 extremidade
+species_xref
 minerais
 oracao
 ligando
@@ -6502,7 +6504,6 @@ theatre
 aproximar
 broadway
 espectral
-html
 denominados
 petersburgo
 causados
@@ -7749,7 +7750,6 @@ descricoes
 estudando
 minoria
 mundos
-style
 doentes
 egipcia
 potenciais
@@ -8066,7 +8066,6 @@ seda
 vinil
 comissario
 construindo
-head
 livrar
 parecida
 cordeiro
@@ -10482,7 +10481,6 @@ trato
 armadilha
 desculpas
 incorporando
-lang
 anda
 forno
 revestimento
@@ -11165,7 +11163,6 @@ liberar
 mozart
 vietname
 acusada
-body
 cessar
 compressao
 demonstraram
@@ -12782,7 +12779,6 @@ perfect
 prize
 recopa
 sacrificios
-script
 solteiro
 springs
 stories
@@ -15122,7 +15118,6 @@ severino
 soluvel
 tijolo
 bizantinas
-colspan
 cursar
 educar
 eslavos
@@ -15745,7 +15740,6 @@ praticando
 reunida
 sabido
 sterling
-strong
 tigres
 vulnerabilidade
 arenas
@@ -17134,6 +17128,7 @@ morton
 september
 subdividida
 tara
+taxon_name_xref
 tornavam
 aparencias
 atualizados
@@ -19770,7 +19765,6 @@ recanto
 ressentimento
 shonen
 smackdown
-table
 tape
 wembley
 adventista
@@ -20000,3 +19994,9 @@ medicacao
 participaria
 pendente
 penitencia
+prejudiciais
+questionamento
+racista
+rampas
+rectangular
+republicanas
diff --git a/tika-eval/src/main/resources/common_tokens/ru b/tika-eval/src/main/resources/common_tokens/ru
index 82514f2..025c91b 100644
--- a/tika-eval/src/main/resources/common_tokens/ru
+++ b/tika-eval/src/main/resources/common_tokens/ru
@@ -6361,7 +6361,6 @@ house
 англииских
 китаиского
 участница
-html
 official
 газет
 вокзала
@@ -14180,7 +14179,6 @@ business
 простирается
 пулемет
 рязань
-lang
 корону
 олимпииском
 выплаты
@@ -17330,7 +17328,6 @@ forbes
 спин
 суды
 хозяиству
-head
 маркграф
 представленныи
 развлечения
@@ -19177,6 +19174,7 @@ army
 раскрывается
 сегмента
 финансовое
+__toc__
 вчера
 выкуп
 годовщину
@@ -19495,7 +19493,6 @@ ethnologue
 своеобразнои
 станице
 теоремы
-style
 бокса
 городища
 доставили
@@ -20000,3 +19997,6 @@ opera
 блоке
 инженерныи
 ковалев
+нарушениями
+плавник
+платформах
diff --git a/tika-eval/src/main/resources/common_tokens/ur b/tika-eval/src/main/resources/common_tokens/ur
index d59ce24..34bb949 100644
--- a/tika-eval/src/main/resources/common_tokens/ur
+++ b/tika-eval/src/main/resources/common_tokens/ur
@@ -6016,6 +6016,7 @@ ghost
 ہالٹ
 ہوشيار
 ہيٹي
+__toc__
 cricket
 power
 ابھرا
@@ -6228,7 +6229,6 @@ general
 class
 computer
 hilaire
-html
 museum
 research
 ابوالقاسم
@@ -9322,7 +9322,6 @@ main
 office
 radio
 singh
-table
 tomb
 اباواجداد
 ابرار
@@ -10717,7 +10716,6 @@ money
 mont
 ouen
 president
-section
 version
 ابراهيم
 احتجاجات
@@ -11472,7 +11470,6 @@ test
 based
 battle
 being
-body
 club
 complex
 cross
@@ -13911,7 +13908,6 @@ forum
 grande
 guardian
 hamilton
-head
 hockey
 jewish
 justice
@@ -14262,7 +14258,6 @@ results
 rockingham
 role
 serbia
-style
 tunisia
 ابروريزي
 ابوجہل
@@ -15036,7 +15031,6 @@ february
 fission
 five
 foret
-form
 front
 gervais
 hospital
@@ -15699,6 +15693,7 @@ woman
 سپردگي
 سہارن
 شاقہ
+شاہ_عالم_ثاني
 شرجيل
 شعاري
 شفافيت
@@ -16710,6 +16705,7 @@ young
 روسايے
 روكس
 رولنگ
+رياست_حيدراباد
 ريسكيو
 ريشوں
 ريكي
@@ -18233,7 +18229,6 @@ random
 range
 ranking
 routledge
-script
 sierra
 solution
 sourcebl
@@ -18247,7 +18242,6 @@ syndrome
 terrorism
 texts
 tier
-title
 tradition
 units
 until
@@ -19394,6 +19388,7 @@ yonne
 محتلف
 محرابيں
 محرر
+محمد_شاہ
 محير
 مخبر
 مخلصين
@@ -19740,6 +19735,7 @@ weapon
 افسس
 افٹر
 اقليدسي
+اكبر_شاہ_ثاني
 اكسيد
 اكلان
 الابيض
@@ -20000,3 +19996,7 @@ weapon
 ريلياں
 رينالڈز
 ريوں
+رچفيلڈ
+رڈيارڈ
+زاروں
+زايدہ
diff --git a/tika-eval/src/main/resources/common_tokens/vi b/tika-eval/src/main/resources/common_tokens/vi
index 32b47c4..98684b8 100644
--- a/tika-eval/src/main/resources/common_tokens/vi
+++ b/tika-eval/src/main/resources/common_tokens/vi
@@ -92,7 +92,6 @@ sach
 chan
 cach
 mien
-lang
 phai
 phia
 nhien
@@ -206,7 +205,6 @@ nghien
 nhanh
 rieng
 bung
-table
 nghiem
 thuan
 cham
@@ -311,7 +309,6 @@ giet
 thon
 nuoi
 mexico
-body
 crambidae
 ranh
 john
@@ -1162,7 +1159,6 @@ sydney
 attelabidae
 koch
 moore
-html
 myanma
 nghenh
 major
@@ -1867,7 +1863,6 @@ michelin
 anton
 isaac
 record
-style
 march
 nevada
 robin
@@ -1994,7 +1989,6 @@ stuart
 yale
 debehogne
 dennstaedtiaceae
-head
 kingdom
 mount
 bartramiaceae
@@ -3139,7 +3133,6 @@ diana
 hadena
 mammal
 medical
-section
 snow
 hoffman
 kiss
@@ -3573,7 +3566,6 @@ nitida
 overview
 pocock
 satellite
-form
 hatt
 journey
 lopatin
@@ -4150,7 +4142,6 @@ gleicheniaceae
 medina
 polk
 sherman
-title
 touch
 allison
 bethlehem
@@ -4931,7 +4922,6 @@ societe
 somerset
 spermacoce
 stephanie
-strong
 technologies
 africanus
 along
@@ -7084,7 +7074,6 @@ boom
 bratislava
 chlorophytum
 coleophora
-colspan
 columnea
 cortinariaceae
 cybocephalus
@@ -7270,6 +7259,7 @@ mecca
 miocen
 miwa
 obtusa
+oibook_result
 pallescens
 perm
 poems
@@ -7836,7 +7826,6 @@ patellapis
 plastic
 potamogetonaceae
 rudolph
-script
 shawn
 shorea
 smicridea
@@ -8452,7 +8441,6 @@ perrier
 rada
 railway
 rockefeller
-rowspan
 sort
 specific
 theresa
@@ -10872,6 +10860,7 @@ valentina
 vehicles
 yalta
 acuminatus
+al_op
 alpinus
 asarum
 ater
@@ -14360,6 +14349,7 @@ faulkner
 fortis
 fresno
 funet
+gbs_navlinks_s
 goddard
 granger
 grodno
@@ -17025,6 +17015,7 @@ awakening
 bacillus
 bears
 benn
+bic_bromcr
 blackpool
 botafogo
 brasiliense
@@ -17568,7 +17559,6 @@ samir
 sekigahara
 selene
 sera
-span
 staffordshire
 stations
 steinberg
@@ -20000,3 +19990,13 @@ saburo
 sagittata
 samguk
 sanborn
+sanna
+sarikamis
+savsat
+schizachyrium
+schlesinger
+sebinkarahisar
+senshi
+serik
+seriz
+serotina
diff --git a/tika-eval/src/main/resources/common_tokens/zh-cn b/tika-eval/src/main/resources/common_tokens/zh-cn
index 4cb5216..cd97411 100644
--- a/tika-eval/src/main/resources/common_tokens/zh-cn
+++ b/tika-eval/src/main/resources/common_tokens/zh-cn
@@ -17273,7 +17273,6 @@ thomas
 让她
 造业
 都拉
-style
 为从
 任妻
 但都
@@ -20000,3 +19999,4 @@ playstation
 为元
 为布
 了太
+他正
diff --git a/tika-eval/src/main/resources/common_tokens/zh-tw b/tika-eval/src/main/resources/common_tokens/zh-tw
index 6f85f5c..23be28f 100644
--- a/tika-eval/src/main/resources/common_tokens/zh-tw
+++ b/tika-eval/src/main/resources/common_tokens/zh-tw
@@ -17267,7 +17267,6 @@ thomas
 让她
 造业
 都拉
-style
 为从
 任妻
 但都
@@ -20000,3 +19999,4 @@ playstation
 是國
 有改
 有证
+树属