You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by hw...@apache.org on 2010/08/04 05:08:03 UTC

svn commit: r982108 - in /labs/mouse: guesser/ guesser/__init__.py guesser/binary.py tests/test_mouse.py

Author: hwright
Date: Wed Aug  4 03:08:03 2010
New Revision: 982108

URL: http://svn.apache.org/viewvc?rev=982108&view=rev
Log:
Add a module and some tests to Mouse which help determine if a file is binary
or not.  There is still some work to do, but this is a good start.

Added:
    labs/mouse/guesser/   (with props)
    labs/mouse/guesser/__init__.py
    labs/mouse/guesser/binary.py
Modified:
    labs/mouse/tests/test_mouse.py

Propchange: labs/mouse/guesser/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Aug  4 03:08:03 2010
@@ -0,0 +1 @@
+*.pyc

Added: labs/mouse/guesser/__init__.py
URL: http://svn.apache.org/viewvc/labs/mouse/guesser/__init__.py?rev=982108&view=auto
==============================================================================
--- labs/mouse/guesser/__init__.py (added)
+++ labs/mouse/guesser/__init__.py Wed Aug  4 03:08:03 2010
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+'''A module to help Mouse guess the different kinds of files.
+
+Helper methods are defined here so that consumers only need import this
+module, rather than all the child modules.'''
+
+import binary
+
+def is_binary(item):
+  return binary.is_binary(item)

Added: labs/mouse/guesser/binary.py
URL: http://svn.apache.org/viewvc/labs/mouse/guesser/binary.py?rev=982108&view=auto
==============================================================================
--- labs/mouse/guesser/binary.py (added)
+++ labs/mouse/guesser/binary.py Wed Aug  4 03:08:03 2010
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+#
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+'''Module to determine if a file with given content is binary.'''
+
+import os
+import mimetypes
+
+
+_data_exts = [
+    'dat', 'doc', 'ncb', 'idb', 'suo', 'xcf', 'raj', 'cert', 'ks', 'ts', 'odp',
+  ]
+
+_exec_exts = [
+    'exe', 'dll', 'lib', 'so', 'a', 'exp',
+  ]
+
+_keystore_exts = [
+    'jks', 'keystore', 'pem', 'crl',
+  ]
+
+_image_exts = [
+    'png', 'pdf', 'gif', 'giff', 'tif', 'tiff', 'jpg', 'jpeg', 'ico', 'icns',
+  ]
+
+_bytecode_exts = [
+    'class', 'pyd', 'obj', 'pyc',
+  ]
+
+_binary_exts = _data_exts + _exec_exts + _keystore_exts + _image_exts + \
+               _bytecode_exts
+
+
+def is_binary(item):
+  '''Entry method, will return True if ITEM is thought to be binary,
+     False otherwise.'''
+
+  # First, try the mime-type
+  (type, encoding) = mimetypes.guess_type(item.name)
+  print '%s : %s ' % (item.name, type)
+  if type and type.split('/')[0] in ('image', 'application'):
+    return True
+
+  # Now, manually look at file extensions
+  (root, ext) = os.path.splitext(item.name)
+  if ext[1:] in _binary_exts:
+    return True
+
+  # Special case: we still think it is binary if it contains an executable
+  # extension in the filename, not just at the end
+  for ext in _exec_exts:
+    if ('.' + ext + '.') in item.name:
+      return True
+
+  # Time to attempt a brute-force divination

Modified: labs/mouse/tests/test_mouse.py
URL: http://svn.apache.org/viewvc/labs/mouse/tests/test_mouse.py?rev=982108&r1=982107&r2=982108&view=diff
==============================================================================
--- labs/mouse/tests/test_mouse.py (original)
+++ labs/mouse/tests/test_mouse.py Wed Aug  4 03:08:03 2010
@@ -32,6 +32,7 @@ sys.path.append(os.path.dirname(sys.path
 
 import mouse
 import sources
+import guesser
 
 data_path = 'data'
 resources_path = os.path.join(os.path.dirname(sys.path[0]), 'resources')
@@ -128,6 +129,22 @@ class TestReport(unittest.TestCase):
 #    self._check_plain_output('rat-tests', 'rat-tests')
 
 
+class TestBinaryGuessing(unittest.TestCase):
+  '''Test the various ways we tell if something is a binary file.
+     This set of tests is largely stolen from RAT's testsuite.'''
+
+  _names = [ 'image.png', 'image.pdf', 'image.gif', 'image.giff', 'image.tif',
+    'image.tiff', 'image.jpg', 'image.jpeg', 'image.exe', 'Whatever.class',
+    'data.dat', 'libicudata.so.34.' ]
+
+  def test_is_binary(self):
+    for name in self._names:
+      # Don't bother giving the item contents, since we shouldn't ever
+      # have to check the content, anyway
+      self.assertTrue(guesser.is_binary(sources.Item(name, None)))
+
+
+
 class TestFilters(unittest.TestCase):
   'Test filtering various files and patterns'
 



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org