You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/11/23 14:39:44 UTC
svn commit: r1205395 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/txt/CharsetDetector.java
test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
test/resources/test-documents/resume.html
Author: maxcom
Date: Wed Nov 23 13:39:43 2011
New Revision: 1205395
URL: http://svn.apache.org/viewvc?rev=1205395&view=rev
Log:
TIKA-787: Improve charset detection for UTF-8 HTML fragment
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1205395&r1=1205394&r2=1205395&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Wed Nov 23 13:39:43 2011
@@ -94,7 +94,7 @@ public class CharsetDetector {
return this;
}
- private static final int kBufSize = 8000;
+ private static final int kBufSize = 12000;
private static final int MAX_CONFIDENCE = 100;
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java?rev=1205395&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java Wed Nov 23 13:39:43 2011
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+public class CharsetDetectorTest { // Checks charset detection on the HTML fragment added for TIKA-787.
+ @Test
+ public void testTagDropper() throws IOException { // With the input filter on, the top-confidence match for resume.html must be UTF-8.
+ InputStream in = CharsetDetectorTest.class.getResourceAsStream( "/test-documents/resume.html" ); // classpath resource; NOTE(review): not null-checked, so a missing file surfaces as a NullPointerException
+
+ try {
+ CharsetDetector detector = new CharsetDetector();
+ detector.enableInputFilter(true); // presumably strips markup before detection (cf. "TagDropper" in the test name) - confirm in CharsetDetector
+ detector.setText(in);
+ CharsetMatch [] matches = detector.detectAll(); // all candidate charsets; ordering is not relied on below
+ CharsetMatch mm = null; // best match seen so far
+ for ( CharsetMatch m : matches ) {
+ if ( mm == null || mm.getConfidence() < m.getConfidence() ) { // strict '<' keeps the earliest match when confidences tie
+ mm = m;
+ }
+ }
+ assertTrue( mm != null ); // fails when detectAll() returned no matches
+ assertEquals( "UTF-8", mm.getName() );
+ } finally {
+ in.close(); // close the fixture stream whether or not the assertions passed
+ }
+ }
+}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html?rev=1205395&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html Wed Nov 23 13:39:43 2011
@@ -0,0 +1,73 @@
+
+
+ <div class="js-helper">
+ <style type="text/css">#style_13209008630000000884_BODY{background-color:#FFFFFF;color:#000000;MARGIN:0px 1px;font-family:Tahoma,Arial,Verdana,Sans-Serif}#style_13209008630000000884 TD{font-size:13px;font-family:Tahoma,Arial,Verdana,Sans-Serif;vertical-align:top}#style_13209008630000000884 CAPTION{font-size:13px;font-weight:bold;text-align:left}#style_13209008630000000884 TR.style_13209008630000000884thead TD{font-weight:bold;text-align:center; padding-bottom:6px;padding-top:6px;padding-left:2px;padding-right:2px}#style_13209008630000000884 H1{font-size:24px;margin-bottom:15px;margin-top:5px;display:block;font-weight:normal;}#style_13209008630000000884 H2{font-size:22px;margin-bottom:5px;margin-top:5px;display:block;font-weight:normal;letter-spacing:1px}#style_13209008630000000884 H1.style_13209008630000000884in, #style_13209008630000000884 H2.style_13209008630000000884in, #style_13209008630000000884 H3.style_13209008630000000884in{font-size:100%;margin-bottom:0px;margin-to
p:0px;display:inline;}#style_13209008630000000884 A, #style_13209008630000000884 A.style_13209008630000000884notvisited:visited, #style_13209008630000000884 .style_13209008630000000884notvisited A:visited, #style_13209008630000000884 .style_13209008630000000884menu A:visited{color:#00418F;text-decoration:none}#style_13209008630000000884 A:visited{color:#6699CC;text-decoration:none;}#style_13209008630000000884 A:hover, #style_13209008630000000884 A.style_13209008630000000884notvisited:hover, #style_13209008630000000884 .style_13209008630000000884notvisited A:hover, #style_13209008630000000884 .style_13209008630000000884menu A:hover{color:#990000;text-decoration:underline}#style_13209008630000000884 .style_13209008630000000884bold, #style_13209008630000000884 .style_13209008630000000884bold H1{font-weight:bold}#style_13209008630000000884 .style_13209008630000000884u{text-decoration:underline}#style_13209008630000000884 .style_13209008630000000884gray, #style_132090086300000008
84 A.style_13209008630000000884gray:visited, #style_13209008630000000884 LEGEND{color:#7A7A7A}#style_13209008630000000884 .style_13209008630000000884red, #style_13209008630000000884 A.style_13209008630000000884red:visited{color:#C2311A}#style_13209008630000000884 EM, #style_13209008630000000884 .style_13209008630000000884imp, #style_13209008630000000884 .style_13209008630000000884field_warning{color:#C2311A;font-weight:bold;font-style:normal}#style_13209008630000000884 TABLE.style_13209008630000000884bl_table TR TD{padding:2px; padding-left:10px}#style_13209008630000000884 TD.style_13209008630000000884bl_row_name{color:#555; width:10%}#style_13209008630000000884 TD.style_13209008630000000884vacancydark, #style_13209008630000000884 TR.style_13209008630000000884vacancydark TD, #style_13209008630000000884 TD.style_13209008630000000884resumedark, #style_13209008630000000884 TR.style_13209008630000000884resumedark TD, #style_13209008630000000884 TD.style_13209008630000000884serve
rdark, #style_13209008630000000884 TR.style_13209008630000000884serverdark TD{text-align:center;padding-bottom:3px;padding-top:3px;padding-left:1px;padding-right:1px;font-weight:bold}#style_13209008630000000884 TD.style_13209008630000000884vacancydark, #style_13209008630000000884 TR.style_13209008630000000884vacancydark TD, #style_13209008630000000884 TD.style_13209008630000000884vacancydark A, #style_13209008630000000884 TD.style_13209008630000000884vacancydark A:visited, #style_13209008630000000884 TD.style_13209008630000000884vacancydark A:hover, #style_13209008630000000884 TD.style_13209008630000000884resumedark, #style_13209008630000000884 TR.style_13209008630000000884resumedark TD, #style_13209008630000000884 TD.style_13209008630000000884resumedark A, #style_13209008630000000884 TD.style_13209008630000000884resumedark A:visited, #style_13209008630000000884 TD.style_13209008630000000884resumedark A:hover, #style_13209008630000000884 TD.style_13209008630000000884serverda
rk, #style_13209008630000000884 TR.style_13209008630000000884serverdark TD, #style_13209008630000000884 TD.style_13209008630000000884serverdark A, #style_13209008630000000884 TD.style_13209008630000000884serverdark A:visited, #style_13209008630000000884 TD.style_13209008630000000884serverdark A:hover{color:#000000;}#style_13209008630000000884 TD.style_13209008630000000884vacancydark, #style_13209008630000000884 TR.style_13209008630000000884vacancydark TD{background-color:#FFDDBB;}#style_13209008630000000884 TD.style_13209008630000000884vacancylight, #style_13209008630000000884 TR.style_13209008630000000884vacancylight TD{background-color:#FFF5EC}#style_13209008630000000884 TD.style_13209008630000000884resumedark, #style_13209008630000000884 TR.style_13209008630000000884resumedark TD{background-color:#D3E9E9;}#style_13209008630000000884 TD.style_13209008630000000884resumelight, #style_13209008630000000884 TR.style_13209008630000000884resumelight TD{background-color:#ECF8F7}#s
tyle_13209008630000000884 TD.style_13209008630000000884serverdark, #style_13209008630000000884 TR.style_13209008630000000884serverdark TD{background-color:#ABC2D5;}#style_13209008630000000884 TR.style_13209008630000000884serverlight TD, #style_13209008630000000884 TD.style_13209008630000000884serverlight{background-color:#E2EBF5}#style_13209008630000000884 TD.style_13209008630000000884blankheader1{font-size:24px; padding:10px}#style_13209008630000000884 TD.style_13209008630000000884blankheader2{font-size:22px; padding:10px}#style_13209008630000000884 TABLE.style_13209008630000000884resumelist TR.thead TD{background-color:#ABC2D5;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist TR.thead TD{background-color:#ABC2D5;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TR.thead TD{background-color:#DBDBDB;}#style_13209008630000000884 TABLE TR.style_13209008630000000884wr TD{background-color:#FFFFFF}#style_13209008630000000884 TABLE.st
yle_13209008630000000884vaclist_for_mail TD{border-bottom:#DBDBDB 1px solid}#style_13209008630000000884 .style_13209008630000000884list TR TD{background-color:#E2EBF5;padding:5px}#style_13209008630000000884 .style_13209008630000000884list TR.thead TD{background-color:#ABC2D5;color:#555555;text-align:center; padding-bottom:8px;padding-top:8px;padding-left:1px;padding-right:1px;font-weight:bold;}#style_13209008630000000884 .style_13209008630000000884list TR.wr TD{background-color:#F3F7FB}#style_13209008630000000884 A.style_13209008630000000884list_details, #style_13209008630000000884 A.style_13209008630000000884list_details:visited, #style_13209008630000000884 A.style_13209008630000000884list_details:hover{color:#7A7A7A;text-decoration:none;line-height:120%}#style_13209008630000000884 TD.style_13209008630000000884cell, #style_13209008630000000884 TD.style_13209008630000000884c{padding-top:3px;padding-left:5px;padding-right:5px}#style_13209008630000000884 BIG{font-size:24px}#st
yle_13209008630000000884 .style_13209008630000000884small, #style_13209008630000000884 SMALL{font-size:85%}#style_13209008630000000884 UL{margin-left:25px;margin-bottom:0px}#style_13209008630000000884 TD.style_13209008630000000884small, #style_13209008630000000884 .style_13209008630000000884verysmall, #style_13209008630000000884 .style_13209008630000000884verysmall INPUT, #style_13209008630000000884 .style_13209008630000000884verysmall SELECT{font-size:11px}#style_13209008630000000884 DIV.style_13209008630000000884localmenu{padding-top:10px;margin-bottom:15px;}#style_13209008630000000884 DIV.style_13209008630000000884localmenu A, #style_13209008630000000884 DIV.style_13209008630000000884localmenu A:visited{text-decoration:underline;font-weight:bold}#style_13209008630000000884 DIV.style_13209008630000000884comment{font-size:85%; background-color:#DDFFDD; padding:4px; border:1px solid #CCC;cursor:default;}#style_13209008630000000884 HR{color:#ABC2D5;background-color:#ABC2D5;he
ight:1px;border:0px solid #ABC2D5}#style_13209008630000000884 DIV.style_13209008630000000884dotsline{font-size:1px; margin-top:4px; margin-bottom:5px; border-bottom:#BACBD7 1px dotted}#style_13209008630000000884 TABLE.style_13209008630000000884rctable TR TD{background-color:#E5EDF7;}#style_13209008630000000884 TD.style_13209008630000000884rc1{padding-top:10px; padding-left:10px;}#style_13209008630000000884 TD.style_13209008630000000884rc2{font-size:1px; width:10px;}#style_13209008630000000884 TD.style_13209008630000000884rc3{height:10px; font-size:1px;}#style_13209008630000000884 TD.style_13209008630000000884rc4{height:10px; font-size:1px;}#style_13209008630000000884 SPAN.style_13209008630000000884super{color:#003398;font-size:150%}#style_13209008630000000884 SPAN.style_13209008630000000884job{color:#FF0000;font-size:150%}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button{background-color:#99cc00; margin:0px 5px 3px 0px;}#sty
le_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button TD{background-color:#99cc00; font-weight:normal; color:#ffffff; border-bottom:0px; padding-top:6px; padding-right:7px; padding-bottom:6px; padding-left:7px; vertical-align:middle; text-align:center;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button TD A, #style_13209008630000000884 TABLE.style_13209008630000000884to_site_button TD A:visited{color:#ffffff; text-decoration:none; font-weight:normal;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button TD A:hover{color:#ffffff; text-decoration:underline; font-weight:normal;}#style_13209008630000000884 .style_13209008630000000884row{clear:left; padding-bottom:4px;}#style_13209008630000000884 .style_13209008630000000884row2{margin-bottom:8px;}#style_13209008630000000884 .style_13209008630000000884col1{float:left; width:140px; color:#55555
5; margin-right:-145px;}#style_13209008630000000884 .style_13209008630000000884col2{margin-left:145px;}#style_13209008630000000884 DIV.style_13209008630000000884resume_rightcol{float:right; width:280px; margin:0px 0px 10px 30px;}#style_13209008630000000884 DIV.style_13209008630000000884blankheader1{font-size:190%;}
+</style>
+ <div id="style_13209008630000000884" class="mr_read__body">
+ <base target="_self" href="http://e.mail.ru/cgi-bin/" />
+
+ <div id="style_13209008630000000884_BODY">
+
+
+
+<style type="text/css" ></style>
+
+
+<table width="100%" cellspacing="0" cellpadding="0" height="100%" border="0" >
+<tr ><td >
+
+</td></tr>
+<tr ><td style="padding:5px" height="100%" >
+Здравствуйте, !<br >
+<br >
+Предлагаем Вам ознакомиться со списком зарегистрированных компаний, представители которых просмотрели Ваше резюме за последние сутки.<br >
+<br >
+<li ><a target="_blank" href="/cgi-bin/link?check=1&cnf=710139&url=http%3A%2F%2;0,0" >Компании, просмотревшие резюме № .</a> Новые: <b >1.</b></li><br >
+<br >
+Эти сведения предоставляются Вам исключительно для информации. Вы можете оперативно отслеживать, какие именно компании нашли в базе данных Superjob Ваше резюме и заинтересовались им.<br >
+<br >
+Если Ваше резюме размещено в закрытом доступе, то его могут просматривать только те работодатели, которым Вы отправили его самостоятельно.<br >
+Историю отправки своего резюме Вы можете посмотреть по ссылке «История рассылки резюме».<br >
+<br >
+<br >
+<b >Внимание!</b><br >
+В процессе поиска работы Вы можете столкнуться с такими предложениями работодателей или кадровых агентств, в которых Вас будут просить внести оплату (за предварительное обучение, за оформление документов, за оформление обязательной страховки, на закупку первой партии продукции компании, предназначенной для продажи и т.п.) или предоставить отсканированные копии документов (паспорта, военного билета, трудовой книжки, водительских прав, пенсионного удостоверения и т.п.) для якобы предварительного оформления или подтверждения данных, указанных в Вашем резюме.<br >
+Это один из признаков мошенничества! Мы рекомендуем Вам очень осторожно относиться к таким предложениям и по возможности избегать собеседований с подобными работодателями.<br >
+<br >
+Также мы настоятельно не рекомендуем отправлять платные SMS-сообщения на короткие номера для получения контактов или другой информации о вакансии или же для получения результатов тестирования. С организациями, которые оказывают подобные услуги, мы не сотрудничаем и предупреждаем, что это тоже один из приемов мошенничества.<br >
+<br >
+<br >
+<em >x</em> <a target="_blank" href="/cgi-bin/link?check=1&cnf=8d972a&url=http%3A%2F%2Fwww.sup;0,0" >Отключить уведомления о новых просмотрах моих резюме</a><br >
+<br >
+По ссылкам в этом письме можно войти в систему без ввода пароля.
+<br ><br >
+</td>
+</tr>
+<tr >
+<td >
+<span class="style_13209008630000000884noprint" ><br ><br >Если у Вас есть пожелания и идеи по улучшению сервиса Superjob, пожалуйста, <a target="_blank" href="/cgi-bin/link?check=1;0,0" >напишите нам</a>.<br ><br ></span>
+<table width="100%" cellspacing="0" cellpadding="10" border="0" class="style_13209008630000000884noprint" >
+<tr ><td align="center" style="border-top:1px solid #BACBD7;" >
+<a target="_blank" href="/cgi-bin/link?check=1&cnf=8fa2f9&url=http%3A%2F%2Fwww.;0,0" ><big >Superjob — Работа должна доставлять удовольствие!</big></a>
+</td></tr>
+</table>
+<table width="100%" cellspacing="1" cellpadding="0" border="0" class="style_13209008630000000884noprint" >
+<tr ><td align="center" style="padding:5px" >
+<span style="color:#999999;font-size:8pt;" >Письмо отправлено: xx.xx.xxxx xx:xx:xx</span>
+</td></tr>
+</table>
+
+</td></tr>
+</table>
+
+
+
+</div>
+
+
+ <base target="_self" href="http://e.mail.ru/cgi-bin/" />
+ </div>
+</div>
+
+
+