You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/02/03 18:27:09 UTC

[1/4] tika git commit: Test JS file that includes <html in it, based on JS from the ComDev website TIKA-1141
Repository: tika
Updated Branches:
  refs/heads/master 1e0159b73 -> 6c0b7906e


Test JS file that includes <html in it, based on JS from the ComDev website TIKA-1141


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d8a2fc01
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d8a2fc01
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d8a2fc01

Branch: refs/heads/master
Commit: d8a2fc01b4da5ffb7be19864512401c54aa04bfd
Parents: 046e43f
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Feb 3 17:10:33 2016 +0000
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Feb 3 17:10:33 2016 +0000

----------------------------------------------------------------------
 .../resources/test-documents/testJS_HTML.js     | 91 ++++++++++++++++++++
 1 file changed, 91 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d8a2fc01/tika-parsers/src/test/resources/test-documents/testJS_HTML.js
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testJS_HTML.js b/tika-parsers/src/test/resources/test-documents/testJS_HTML.js
new file mode 100644
index 0000000..a362198
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testJS_HTML.js
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+var places = new Array();
+
+places[0] = {
+   'name': 'Oxford', lat: 51.75222, lng: -1.25596,
+   'id': 'map_1',
+}
+places[1] = {
+   'name': 'Oxford', lat: 41.43399, lng: -73.11678,
+   'id': 'map_2',
+}
+places[2] = {
+   'name': 'Oxford', lat: -43.3, lng: 172.18333,
+   'id': 'map_3',
+}
+places[3] = {
+   'name': 'Oxford', lat: 33.619, lng: -83.86741,
+   'id': 'map_4',
+}
+places[4] = {
+   'name': 'Oxford', lat: 44.13174, lng: -70.49311,
+   'id': 'map_5',
+}
+places[5] = {
+   'name': 'Oxford', lat: 39.78539, lng: -75.97883,
+   'id': 'map_6',
+}
+places[6] = {
+   'name': 'Oxford', lat: 40.51976, lng: -87.24779,
+   'id': 'map_7',
+}
+places[7] = {
+   'name': 'Oxford', lat: 45.73345, lng: -63.86542,
+   'id': 'map_8',
+}
+places[8] = {
+   'name': 'Oxford', lat: 42.44202, lng: -75.59769,
+   'id': 'map_9',
+}
+places[9] = {
+   'name': 'Oxford', lat: 40.80315, lng: -74.98962,
+   'id': 'map_10',
+}
+
+function drawMaps() {
+   if (GBrowserIsCompatible()) {
+      for(var i in places) {
+         var p = places[i];
+         var div = document.getElementById(p['id']);
+
+         div.style.display = "block";
+         div.parentNode.style.marginBottom = "35px";
+
+         var map = new GMap2(div);
+         map.setCenter(new GLatLng(p['lat'], p['lng']), 8);
+
+         var m = new GMarker( 
+            new GLatLng(p['lat'], p['lng']),
+            {title: p['name']}
+         );
+         map.addOverlay(m);
+      }
+   } else {
+      document.write("<!doctype><html><body><h1>Unsupported Browser</h1></body></html>");
+   }
+}
+
+var t;
+$(document).ready(function(){
+      t = setTimeout(function() {
+         clearTimeout(t);
+         drawMaps();
+      }, 15*1000);
+});


[2/4] tika git commit: Lower the priority of <html later in the file header

Posted by ni...@apache.org.
Lower the priority of <html later in the file header

Keep the current higher magic priority for <html near the start of the
file header for detecting HTML files, but drop it for later in the file
header, to help avoid false positives, and aid with TIKA-1141


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d740f5d8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d740f5d8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d740f5d8

Branch: refs/heads/master
Commit: d740f5d8b2e42b1db42806ddd395e034cb416fd4
Parents: d8a2fc0
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Feb 3 17:11:06 2016 +0000
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Feb 3 17:11:06 2016 +0000

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml        | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d740f5d8/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 1d7b42b..95f41e6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5432,12 +5432,6 @@
       <match value="&lt;head" type="string" offset="0:64"/>
       <match value="&lt;TITLE" type="string" offset="0:64"/>
       <match value="&lt;title" type="string" offset="0:64"/>
-      <!-- note on the offset value here: this can only be as big as
-           MimeTypes#getMinLength(). If you set the offset value to larger
-           than that size, the magic will only be compared to up to
-           MimeTypes#getMinLength() bytes.
-       -->
-      <match value="&lt;html" type="string" offset="0:8192"/>
       <match value="&lt;HTML" type="string" offset="0:64"/>
       <match value="&lt;BODY" type="string" offset="0"/>
       <match value="&lt;body" type="string" offset="0"/>
@@ -5449,6 +5443,17 @@
       <match value="&lt;H1" type="string" offset="0"/>
       <match value="&lt;!doctype HTML" type="string" offset="0"/>
       <match value="&lt;!DOCTYPE html" type="string" offset="0"/>
+      <match value="&lt;html" type="string" offset="0:128"/>
+    </magic>
+    <magic priority="20">
+      <!-- Lower priority match for <html anywhere near the top of the file -->
+      <!-- note on the offset value here: this can only be as big as
+           MimeTypes#getMinLength(). If you set the offset value to larger
+           than that size, the magic will only be compared to up to
+           MimeTypes#getMinLength() bytes. It should also only start after
+           the higher priority "start of file" one above
+       -->
+      <match value="&lt;html" type="string" offset="128:8192"/>
     </magic>
     <glob pattern="*.html"/>
     <glob pattern="*.htm"/>


[3/4] tika git commit: Unit test for detecting JS files

Posted by ni...@apache.org.
Unit test for detecting JS files

As we don't currently have any JS file magic, we can't detect
as such without the file name. However, with the filename, ensure
we do get it right, even if there's HTML snippet in the JS. TIKA-1141


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/557b3704
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/557b3704
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/557b3704

Branch: refs/heads/master
Commit: 557b3704501a9692809a3e1b7838866786ed3366
Parents: d740f5d
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Feb 3 17:20:55 2016 +0000
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Feb 3 17:25:52 2016 +0000

----------------------------------------------------------------------
 .../test/java/org/apache/tika/mime/TestMimeTypes.java   | 12 ++++++++++++
 1 file changed, 12 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/557b3704/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 77d25df..92f7b88 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -971,6 +971,18 @@ public class TestMimeTypes {
         assertTypeByData("text/x-matlab", "testMATLAB.m");
         assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
         assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
+        
+        // By name, or by name+data, gets it as JS
+        assertTypeByName("application/javascript", "testJS.js");
+        assertTypeByName("application/javascript", "testJS_HTML.js");
+        assertType("application/javascript", "testJS.js");
+        assertType("application/javascript", "testJS_HTML.js");
+        
+        // With data only, because we have no JS file magic, can't be
+        //  detected. One will come through as plain text, the other
+        //  as HTML due to <html> in it. TODO Add JS magic. See TIKA-1141 
+        //assertTypeByData("application/javascript", "testJS.js");
+        //assertTypeByData("application/javascript", "testJS_HTML.js");
     }
 
     @Test


[4/4] tika git commit: Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/tika

Posted by ni...@apache.org.
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/tika


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c0b7906
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c0b7906
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c0b7906

Branch: refs/heads/master
Commit: 6c0b7906ecbc22ea9adb4c1e5781b0eff561957d
Parents: 557b370 1e0159b
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Feb 3 17:26:09 2016 +0000
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Feb 3 17:26:09 2016 +0000

----------------------------------------------------------------------
 .../tika/parser/rtf/RTFEmbObjHandler.java       |   2 +-
 .../apache/tika/parser/rtf/RTFParserTest.java   |   9 +-
 .../tika/server/resource/TikaResource.java      |  14 +-
 .../apache/tika/server/TikaResourceTest.java    |  12 +
 .../testRTF_npeFromWMFInTikaServer.rtf          | 235 +++++++++++++++++++
 5 files changed, 262 insertions(+), 10 deletions(-)
----------------------------------------------------------------------