You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/11/23 14:39:44 UTC

svn commit: r1205395 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/txt/CharsetDetector.java test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java test/resources/test-documents/resume.html

Author: maxcom
Date: Wed Nov 23 13:39:43 2011
New Revision: 1205395

URL: http://svn.apache.org/viewvc?rev=1205395&view=rev
Log:
TIKA-787: Improve charset detection for UTF-8 HTML fragment

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1205395&r1=1205394&r2=1205395&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Wed Nov 23 13:39:43 2011
@@ -94,7 +94,7 @@ public class CharsetDetector {
         return this;
     }
     
-    private static final int kBufSize = 8000;
+    private static final int kBufSize = 12000;
 
     private static final int MAX_CONFIDENCE = 100;
 

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java?rev=1205395&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java Wed Nov 23 13:39:43 2011
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+public class CharsetDetectorTest { // Regression test added for TIKA-787 (charset detection for a UTF-8 HTML fragment).
+  @Test
+  public void testTagDropper() throws IOException { // Expects the resume.html fixture to be detected as UTF-8.
+    InputStream in = CharsetDetectorTest.class.getResourceAsStream( "/test-documents/resume.html" );
+    // NOTE(review): getResourceAsStream returns null when the resource is missing; "in" is not null-checked here.
+    try {
+      CharsetDetector detector = new CharsetDetector();
+      detector.enableInputFilter(true); // presumably drops markup before detection (cf. the test name) - TODO confirm in CharsetDetector
+      detector.setText(in);
+      CharsetMatch [] matches = detector.detectAll();
+      CharsetMatch mm = null; // highest-confidence match seen so far
+      for ( CharsetMatch m : matches ) {
+        if ( mm == null || mm.getConfidence() < m.getConfidence() ) { // strict '<': the first of equally confident matches is kept
+          mm = m;
+        }
+      }
+      assertTrue( mm != null ); // fails when detectAll() returned an empty array
+      assertEquals( "UTF-8", mm.getName() );
+    } finally {
+      in.close(); // release the fixture stream whether or not the assertions passed
+    }
+  }
+}

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html?rev=1205395&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/resume.html Wed Nov 23 13:39:43 2011
@@ -0,0 +1,73 @@
+
+
+	<div class="js-helper">
+	<style type="text/css">#style_13209008630000000884_BODY{background-color:#FFFFFF;color:#000000;MARGIN:0px 1px;font-family:Tahoma,Arial,Verdana,Sans-Serif}#style_13209008630000000884 TD{font-size:13px;font-family:Tahoma,Arial,Verdana,Sans-Serif;vertical-align:top}#style_13209008630000000884 CAPTION{font-size:13px;font-weight:bold;text-align:left}#style_13209008630000000884 TR.style_13209008630000000884thead TD{font-weight:bold;text-align:center; padding-bottom:6px;padding-top:6px;padding-left:2px;padding-right:2px}#style_13209008630000000884 H1{font-size:24px;margin-bottom:15px;margin-top:5px;display:block;font-weight:normal;}#style_13209008630000000884 H2{font-size:22px;margin-bottom:5px;margin-top:5px;display:block;font-weight:normal;letter-spacing:1px}#style_13209008630000000884 H1.style_13209008630000000884in, #style_13209008630000000884 H2.style_13209008630000000884in, #style_13209008630000000884 H3.style_13209008630000000884in{font-size:100%;margin-bottom:0px;margin-to
 p:0px;display:inline;}#style_13209008630000000884 A, #style_13209008630000000884 A.style_13209008630000000884notvisited:visited, #style_13209008630000000884 .style_13209008630000000884notvisited A:visited, #style_13209008630000000884 .style_13209008630000000884menu A:visited{color:#00418F;text-decoration:none}#style_13209008630000000884 A:visited{color:#6699CC;text-decoration:none;}#style_13209008630000000884 A:hover, #style_13209008630000000884 A.style_13209008630000000884notvisited:hover, #style_13209008630000000884 .style_13209008630000000884notvisited A:hover, #style_13209008630000000884 .style_13209008630000000884menu A:hover{color:#990000;text-decoration:underline}#style_13209008630000000884 .style_13209008630000000884bold, #style_13209008630000000884 .style_13209008630000000884bold H1{font-weight:bold}#style_13209008630000000884 .style_13209008630000000884u{text-decoration:underline}#style_13209008630000000884 .style_13209008630000000884gray, #style_132090086300000008
 84 A.style_13209008630000000884gray:visited, #style_13209008630000000884 LEGEND{color:#7A7A7A}#style_13209008630000000884 .style_13209008630000000884red, #style_13209008630000000884 A.style_13209008630000000884red:visited{color:#C2311A}#style_13209008630000000884 EM, #style_13209008630000000884 .style_13209008630000000884imp, #style_13209008630000000884 .style_13209008630000000884field_warning{color:#C2311A;font-weight:bold;font-style:normal}#style_13209008630000000884 TABLE.style_13209008630000000884bl_table TR TD{padding:2px; padding-left:10px}#style_13209008630000000884 TD.style_13209008630000000884bl_row_name{color:#555; width:10%}#style_13209008630000000884 TD.style_13209008630000000884vacancydark, #style_13209008630000000884 TR.style_13209008630000000884vacancydark TD, #style_13209008630000000884 TD.style_13209008630000000884resumedark, #style_13209008630000000884 TR.style_13209008630000000884resumedark TD, #style_13209008630000000884 TD.style_13209008630000000884serve
 rdark, #style_13209008630000000884 TR.style_13209008630000000884serverdark TD{text-align:center;padding-bottom:3px;padding-top:3px;padding-left:1px;padding-right:1px;font-weight:bold}#style_13209008630000000884 TD.style_13209008630000000884vacancydark, #style_13209008630000000884 TR.style_13209008630000000884vacancydark TD, #style_13209008630000000884 TD.style_13209008630000000884vacancydark A, #style_13209008630000000884 TD.style_13209008630000000884vacancydark A:visited, #style_13209008630000000884 TD.style_13209008630000000884vacancydark A:hover, #style_13209008630000000884 TD.style_13209008630000000884resumedark, #style_13209008630000000884 TR.style_13209008630000000884resumedark TD, #style_13209008630000000884 TD.style_13209008630000000884resumedark A, #style_13209008630000000884 TD.style_13209008630000000884resumedark A:visited, #style_13209008630000000884 TD.style_13209008630000000884resumedark A:hover, #style_13209008630000000884 TD.style_13209008630000000884serverda
 rk, #style_13209008630000000884 TR.style_13209008630000000884serverdark TD, #style_13209008630000000884 TD.style_13209008630000000884serverdark A, #style_13209008630000000884 TD.style_13209008630000000884serverdark A:visited, #style_13209008630000000884 TD.style_13209008630000000884serverdark A:hover{color:#000000;}#style_13209008630000000884 TD.style_13209008630000000884vacancydark, #style_13209008630000000884 TR.style_13209008630000000884vacancydark TD{background-color:#FFDDBB;}#style_13209008630000000884 TD.style_13209008630000000884vacancylight, #style_13209008630000000884 TR.style_13209008630000000884vacancylight TD{background-color:#FFF5EC}#style_13209008630000000884 TD.style_13209008630000000884resumedark, #style_13209008630000000884 TR.style_13209008630000000884resumedark TD{background-color:#D3E9E9;}#style_13209008630000000884 TD.style_13209008630000000884resumelight, #style_13209008630000000884 TR.style_13209008630000000884resumelight TD{background-color:#ECF8F7}#s
 tyle_13209008630000000884 TD.style_13209008630000000884serverdark, #style_13209008630000000884 TR.style_13209008630000000884serverdark TD{background-color:#ABC2D5;}#style_13209008630000000884 TR.style_13209008630000000884serverlight TD, #style_13209008630000000884 TD.style_13209008630000000884serverlight{background-color:#E2EBF5}#style_13209008630000000884 TD.style_13209008630000000884blankheader1{font-size:24px; padding:10px}#style_13209008630000000884 TD.style_13209008630000000884blankheader2{font-size:22px; padding:10px}#style_13209008630000000884 TABLE.style_13209008630000000884resumelist TR.thead TD{background-color:#ABC2D5;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist TR.thead TD{background-color:#ABC2D5;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TR.thead TD{background-color:#DBDBDB;}#style_13209008630000000884 TABLE TR.style_13209008630000000884wr TD{background-color:#FFFFFF}#style_13209008630000000884 TABLE.st
 yle_13209008630000000884vaclist_for_mail TD{border-bottom:#DBDBDB 1px solid}#style_13209008630000000884 .style_13209008630000000884list TR TD{background-color:#E2EBF5;padding:5px}#style_13209008630000000884 .style_13209008630000000884list TR.thead TD{background-color:#ABC2D5;color:#555555;text-align:center; padding-bottom:8px;padding-top:8px;padding-left:1px;padding-right:1px;font-weight:bold;}#style_13209008630000000884 .style_13209008630000000884list TR.wr TD{background-color:#F3F7FB}#style_13209008630000000884 A.style_13209008630000000884list_details, #style_13209008630000000884 A.style_13209008630000000884list_details:visited, #style_13209008630000000884 A.style_13209008630000000884list_details:hover{color:#7A7A7A;text-decoration:none;line-height:120%}#style_13209008630000000884 TD.style_13209008630000000884cell, #style_13209008630000000884 TD.style_13209008630000000884c{padding-top:3px;padding-left:5px;padding-right:5px}#style_13209008630000000884 BIG{font-size:24px}#st
 yle_13209008630000000884 .style_13209008630000000884small, #style_13209008630000000884 SMALL{font-size:85%}#style_13209008630000000884 UL{margin-left:25px;margin-bottom:0px}#style_13209008630000000884 TD.style_13209008630000000884small, #style_13209008630000000884 .style_13209008630000000884verysmall, #style_13209008630000000884 .style_13209008630000000884verysmall INPUT, #style_13209008630000000884 .style_13209008630000000884verysmall SELECT{font-size:11px}#style_13209008630000000884 DIV.style_13209008630000000884localmenu{padding-top:10px;margin-bottom:15px;}#style_13209008630000000884 DIV.style_13209008630000000884localmenu A, #style_13209008630000000884 DIV.style_13209008630000000884localmenu A:visited{text-decoration:underline;font-weight:bold}#style_13209008630000000884 DIV.style_13209008630000000884comment{font-size:85%; background-color:#DDFFDD; padding:4px; border:1px solid #CCC;cursor:default;}#style_13209008630000000884 HR{color:#ABC2D5;background-color:#ABC2D5;he
 ight:1px;border:0px solid #ABC2D5}#style_13209008630000000884 DIV.style_13209008630000000884dotsline{font-size:1px; margin-top:4px; margin-bottom:5px; border-bottom:#BACBD7 1px dotted}#style_13209008630000000884 TABLE.style_13209008630000000884rctable TR TD{background-color:#E5EDF7;}#style_13209008630000000884 TD.style_13209008630000000884rc1{padding-top:10px; padding-left:10px;}#style_13209008630000000884 TD.style_13209008630000000884rc2{font-size:1px; width:10px;}#style_13209008630000000884 TD.style_13209008630000000884rc3{height:10px; font-size:1px;}#style_13209008630000000884 TD.style_13209008630000000884rc4{height:10px; font-size:1px;}#style_13209008630000000884 SPAN.style_13209008630000000884super{color:#003398;font-size:150%}#style_13209008630000000884 SPAN.style_13209008630000000884job{color:#FF0000;font-size:150%}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button{background-color:#99cc00; margin:0px 5px 3px 0px;}#sty
 le_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button TD{background-color:#99cc00; font-weight:normal; color:#ffffff; border-bottom:0px; padding-top:6px; padding-right:7px; padding-bottom:6px; padding-left:7px; vertical-align:middle; text-align:center;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button TD A, #style_13209008630000000884 TABLE.style_13209008630000000884to_site_button TD A:visited{color:#ffffff; text-decoration:none; font-weight:normal;}#style_13209008630000000884 TABLE.style_13209008630000000884vaclist_for_mail TD TABLE.to_site_button TD A:hover{color:#ffffff; text-decoration:underline; font-weight:normal;}#style_13209008630000000884 .style_13209008630000000884row{clear:left; padding-bottom:4px;}#style_13209008630000000884 .style_13209008630000000884row2{margin-bottom:8px;}#style_13209008630000000884 .style_13209008630000000884col1{float:left; width:140px; color:#55555
 5; margin-right:-145px;}#style_13209008630000000884 .style_13209008630000000884col2{margin-left:145px;}#style_13209008630000000884 DIV.style_13209008630000000884resume_rightcol{float:right; width:280px; margin:0px 0px 10px 30px;}#style_13209008630000000884 DIV.style_13209008630000000884blankheader1{font-size:190%;}
+</style>
+	<div id="style_13209008630000000884" class="mr_read__body">
+		<base target="_self" href="http://e.mail.ru/cgi-bin/" />
+		
+			<div id="style_13209008630000000884_BODY">
+
+
+
+<style type="text/css" ></style>
+
+
+<table width="100%" cellspacing="0" cellpadding="0" height="100%" border="0" >
+<tr ><td >
+
+</td></tr>
+<tr ><td style="padding:5px" height="100%" >
+Здравствуйте, !<br >
+<br >
+Предлагаем Вам ознакомиться со списком зарегистрированных компаний, представители которых просмотрели Ваше резюме за последние сутки.<br >
+<br >
+<li ><a target="_blank" href="/cgi-bin/link?check=1&cnf=710139&url=http%3A%2F%2;0,0" >Компании, просмотревшие резюме № .</a> Новые: <b >1.</b></li><br >
+<br >
+Эти сведения предоставляются Вам исключительно для информации. Вы можете оперативно отслеживать, какие именно компании нашли в базе данных Superjob Ваше резюме и заинтересовались им.<br >
+<br >
+Если Ваше резюме размещено в закрытом доступе, то его могут просматривать только те работодатели, которым Вы отправили его самостоятельно.<br >
+Историю отправки своего резюме Вы можете посмотреть по ссылке «История рассылки резюме».<br >
+<br >
+<br >
+<b >Внимание!</b><br >
+В процессе поиска работы Вы можете столкнуться с такими предложениями работодателей или кадровых агентств, в которых Вас будут просить внести оплату (за предварительное обучение, за оформление документов, за оформление обязательной страховки, на закупку первой партии продукции 
 компании, предназначенной для продажи и т.п.) или предоставить отсканированные копии документов (паспорта, военного билета, трудовой книжки, водительских прав, пенсионного удостоверния и т.п.) для якобы предварительного оформления или подтверждения данных, указанных в Вашем резюме
 .<br >
+Это один из признаков мошенничества! Мы рекомендуем Вам очень осторожно относиться к таким предложениям и по возможности избегать собеседований с подобными работодателями.<br >
+<br >
+Также мы настоятельно не рекомендуем отправлять платные SMS-сообщения на короткие номера для получения контактов или другой информации о вакансии или же для получения результатов тестирования. С организациями, которые оказывают подобные услуги, мы не сотрудничаем и предупреждаем
 , что это тоже один из приемов мошенничества.<br >
+<br >
+<br >
+<em >x</em> <a target="_blank" href="/cgi-bin/link?check=1&cnf=8d972a&url=http%3A%2F%2Fwww.sup;0,0" >Отключить уведомления о новых просмотрах моих резюме</a><br >
+<br >
+По ссылкам в этом письме можно войти в систему без ввода пароля.
+<br ><br >
+</td>
+</tr>
+<tr >
+<td >
+<span class="style_13209008630000000884noprint" ><br ><br >Если у Вас есть пожелания и идеи по улучшению сервиса Superjob, пожалуйста, <a target="_blank" href="/cgi-bin/link?check=1;0,0" >напишите нам</a>.<br ><br ></span>
+<table width="100%" cellspacing="0" cellpadding="10" border="0" class="style_13209008630000000884noprint" >
+<tr ><td align="center" style="border-top:1px solid #BACBD7;" >
+<a target="_blank" href="/cgi-bin/link?check=1&cnf=8fa2f9&url=http%3A%2F%2Fwww.;0,0" ><big >Superjob — Работа должна доставлять удовольствие!</big></a>
+</td></tr>
+</table>
+<table width="100%" cellspacing="1" cellpadding="0" border="0" class="style_13209008630000000884noprint" >
+<tr ><td align="center" style="padding:5px" >
+<span style="color:#999999;font-size:8pt;" >Письмо отправлено: xx.xx.xxxx xx:xx:xx</span>
+</td></tr>
+</table>
+
+</td></tr>
+</table>
+
+
+
+</div>
+			
+		
+		<base target="_self" href="http://e.mail.ru/cgi-bin/" />
+	</div>
+</div>
+
+
+