You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/02/25 07:47:13 UTC

svn commit: r1662171 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/strings/ test/java/org/apache/tika/parser/strings/

Author: mattmann
Date: Wed Feb 25 06:47:13 2015
New Revision: 1662171

URL: http://svn.apache.org/r1662171
Log:
Updated tests for TIKA-1541 simple strings parser from Giuseppe Totaro.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java Wed Feb 25 06:47:13 2015
@@ -15,6 +15,9 @@ package org.apache.tika.parser.strings;
 
 import java.io.File;
 import java.io.Serializable;
+import java.util.Properties;
+import java.io.InputStream;
+import java.io.IOException;
 
 /**
  * Configuration for the "strings" (or strings-alternative) command.
@@ -27,10 +30,10 @@ public class StringsConfig implements Se
 	private static final long serialVersionUID = -1465227101645003594L;
 
 	private String stringsPath = "";
-	
+
 	// Minimum sequence length (characters) to print
 	private int minLength = 4;
-	
+
 	// Character encoding of the strings that are to be found
 	private StringsEncoding encoding = StringsEncoding.SINGLE_7_BIT;
 
@@ -38,10 +41,57 @@ public class StringsConfig implements Se
 	private int timeout = 120;
 
 	/**
-	 * Default constructor.
+	 * Default constructor.
 	 */
 	public StringsConfig() {
-		// TODO Loads properties from InputStream.
+		init(this.getClass().getResourceAsStream("Strings.properties"));
+	}
+
+	/**
+	 * Loads properties from InputStream and then tries to close InputStream. If
+	 * there is an IOException, this silently swallows the exception and goes
+	 * back to the default.
+	 *
+	 * @param is
+	 */
+	public StringsConfig(InputStream is) {
+		init(is);
+	}
+
+	/**
+	 * Initializes attributes.
+	 *
+	 * @param is
+	 */
+	private void init(InputStream is) {
+		if (is == null) {
+			return;
+		}
+		Properties props = new Properties();
+		try {
+			props.load(is);
+		} catch (IOException e) {
+			// swallow
+		} finally {
+			if (is != null) {
+				try {
+					is.close();
+				} catch (IOException e) {
+					// swallow
+				}
+			}
+		}
+
+		setStringsPath(props.getProperty("stringsPath", "" + getStringsPath()));
+		
+		setMinLength(Integer.parseInt(props.getProperty("minLength", ""
+				+ getMinLength())));
+
+		setEncoding(StringsEncoding.valueOf(props.getProperty("encoding", ""
+				+ getEncoding().get())));
+
+		setTimeout(Integer.parseInt(props.getProperty("timeout", ""
+				+ getTimeout())));
 	}
 
 	/**
@@ -52,7 +102,7 @@ public class StringsConfig implements Se
 	public String getStringsPath() {
 		return this.stringsPath;
 	}
-	
+
 	/**
 	 * Returns the minimum sequence length (characters) to print.
 	 * 
@@ -61,11 +111,12 @@ public class StringsConfig implements Se
 	public int getMinLength() {
 		return this.minLength;
 	}
-	
+
 	/**
 	 * Returns the character encoding of the strings that are to be found.
 	 * 
-	 * @return {@see StringsEncoding} enum that represents the character encoding of the strings that are to be found.
+	 * @return {@see StringsEncoding} enum that represents the character
+	 *         encoding of the strings that are to be found.
 	 */
 	public StringsEncoding getEncoding() {
 		return this.encoding;
@@ -85,40 +136,52 @@ public class StringsConfig implements Se
 	/**
 	 * Sets the "strings" installation folder.
 	 * 
-	 * @param path the "strings" installation folder.
+	 * @param path
+	 *            the "strings" installation folder.
 	 */
 	public void setStringsPath(String path) {
-		char lastChar = path.charAt(path.length() - 1);
-
-		if (lastChar != File.separatorChar) {
+		if (!path.isEmpty() && !path.endsWith(File.separator)) {
 			path += File.separatorChar;
 		}
 		this.stringsPath = path;
 	}
-	
+
 	/**
 	 * Sets the minimum sequence length (characters) to print.
 	 * 
-	 * @param minLength the minimum sequence length (characters) to print.
+	 * @param minLength
+	 *            the minimum sequence length (characters) to print.
 	 */
 	public void setMinLength(int minLength) {
+		if (minLength < 1) {
+			throw new IllegalArgumentException("Invalid minimum length");
+		}
 		this.minLength = minLength;
 	}
-	
+
 	/**
 	 * Sets the character encoding of the strings that are to be found.
 	 * 
-	 * @param encoding {@see StringsEncoding} enum that represents the character encoding of the strings that are to be found.
+	 * @param encoding
+	 *            {@see StringsEncoding} enum that represents the character
+	 *            encoding of the strings that are to be found.
 	 */
-	public void setEncodings(StringsEncoding encoding) {
+	public void setEncoding(StringsEncoding encoding) {
 		this.encoding = encoding;
 	}
 
 	/**
-	 * Sets the maximum time (in seconds) to wait for the "strings" command to terminate.
-	 * @param timeout the maximum time (in seconds) to wait for the "strings" command to terminate.
+	 * Sets the maximum time (in seconds) to wait for the "strings" command to
+	 * terminate.
+	 * 
+	 * @param timeout
+	 *            the maximum time (in seconds) to wait for the "strings"
+	 *            command to terminate.
 	 */
 	public void setTimeout(int timeout) {
+		if (timeout < 1) {
+			throw new IllegalArgumentException("Invalid timeout");
+		}
 		this.timeout = timeout;
 	}
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java Wed Feb 25 06:47:13 2015
@@ -18,7 +18,7 @@ package org.apache.tika.parser.strings;
  *
  */
 public enum StringsEncoding {
-	SINGLE_7_BIT('s', "single-7-bit-byte"),
+	SINGLE_7_BIT('s', "single-7-bit-byte"), // default
 	SINGLE_8_BIT('S', "single-8-bit-byte"),
 	BIGENDIAN_16_BIT('b', "16-bit bigendian"),
 	LITTLEENDIAN_16_BIT('l', "16-bit littleendian"),

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java Wed Feb 25 06:47:13 2015
@@ -63,7 +63,11 @@ public class StringsParser extends Abstr
 	
 	private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig();
 	
-	// String -> Boolean[2] (0 -> is_present. 1 -> supports_encoding)
+	/*
+	 * This map is organized as follows:
+	 * command's pathname (String) -> is it present? (Boolean), does it support -e option? (Boolean)
+	 * It stores check results for command and, if present, -e (encoding) option.
+	 */
 	private static Map<String,Boolean[]> STRINGS_PRESENT = new HashMap<String, Boolean[]>();
 
 	@Override
@@ -121,24 +125,32 @@ public class StringsParser extends Abstr
 		}
 
 		String[] checkCmd = { stringsProg, "--version" };
+		try {
+			boolean hasStrings = ExternalParser.check(checkCmd);
 
-		boolean hasStrings = ExternalParser.check(checkCmd);
-		
-		boolean encodingOpt = false;
-		
-		// Check if the -e option (encoding) is supported
-		if (!System.getProperty("os.name").startsWith("Windows")) {
-			String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"};
-			int[] errorValues = {1, 2}; // 1: General error. 2: Incorrect usage.
-			encodingOpt = ExternalParser.check(checkOpt, errorValues);
-		}
+			boolean encodingOpt = false;
+
+			// Check if the -e option (encoding) is supported
+			if (!System.getProperty("os.name").startsWith("Windows")) {
+				String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"};
+				int[] errorValues = {1, 2}; // Exit status code: 1 = general error; 2 = incorrect usage.
+				encodingOpt = ExternalParser.check(checkOpt, errorValues);
+			}
 		
-		Boolean[] values = {hasStrings, encodingOpt};
-		STRINGS_PRESENT.put(stringsProg, values);
+			Boolean[] values = {hasStrings, encodingOpt};
+			STRINGS_PRESENT.put(stringsProg, values);
 
-		return hasStrings;
+			return hasStrings;
+		} catch (NoClassDefFoundError ncdfe) {
+			// This happens under OSGi + Fork Parser - see TIKA-1507
+			// As a workaround for now, just say we can't use strings
+			// TODO Resolve it so we don't need this try/catch block
+			Boolean[] values = {false, false};
+			STRINGS_PRESENT.put(stringsProg, values);
+			return false;
+		}
 	}
-	
+
 	/**
 	 * Checks if the "file" command is supported.
 	 * 
@@ -183,7 +195,7 @@ public class StringsParser extends Abstr
 		cmdList.add(stringsProg);
 		cmdList.add("-n");
 		cmdList.add("" + config.getMinLength());;
-		// encoding option is not supported by windows version
+		// Currently, encoding option is not supported by Windows (and other) versions
 		if (STRINGS_PRESENT.get(stringsProg)[1]) {
 			cmdList.add("-e");
 			cmdList.add("" + config.getEncoding().get());
@@ -191,7 +203,7 @@ public class StringsParser extends Abstr
 		cmdList.add(input.getPath());
 		
 		String[] cmd = cmdList.toArray(new String[cmdList.size()]);
-
+		
 		ProcessBuilder pb = new ProcessBuilder(cmd);
 		final Process process = pb.start();
 
@@ -312,10 +324,8 @@ public class StringsParser extends Abstr
 			fileOutput = reader.readLine();
 
 		} catch (IOException ioe) {
-			// TODO
-			System.err
-					.println("An error occurred in reading output of the file command: "
-							+ ioe.getMessage());
+			// file output not available!
+			fileOutput = "";
 		} finally {
 			reader.close();
 		}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java Wed Feb 25 06:47:13 2015
@@ -15,6 +15,9 @@ package org.apache.tika.parser.strings;
 
 import static org.junit.Assert.*;
 
+import java.io.File;
+import java.io.InputStream;
+
 import org.junit.Test;
 
 public class StringsConfigTest {
@@ -27,4 +30,32 @@ public class StringsConfigTest {
 		assertEquals("Invalid default min-len value", 4, config.getMinLength());
 		assertEquals("Invalid default timeout value", 120, config.getTimeout());
 	}
+	
+	@Test
+	public void testPartialConfig() {
+		InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-partial.properties");
+		
+		StringsConfig config = new StringsConfig(stream);
+		assertEquals("Invalid default stringsPath value", "", config.getStringsPath());
+		assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+		assertEquals("Invalid default min-len value", 4, config.getMinLength());
+		assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+	}
+	
+	@Test
+	public void testFullConfig() {
+		InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-full.properties");
+		
+		StringsConfig config = new StringsConfig(stream);
+		assertEquals("Invalid overridden stringsPath value", "/opt/strings" + File.separator, config.getStringsPath());
+		assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+		assertEquals("Invalid overridden min-len value", 3, config.getMinLength());
+		assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+	}
+	
+	@Test(expected=IllegalArgumentException.class)
+	public void testValidateEconding() {
+		StringsConfig config = new StringsConfig();
+		config.setMinLength(0);
+	}
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java Wed Feb 25 06:47:13 2015
@@ -18,6 +18,7 @@ import static org.junit.Assert.*;
 import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
+import java.util.Arrays;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -42,12 +43,15 @@ public class StringsParserTest {
 		String resource = "/test-documents/testOCTET_header.dbase3";
 
 		String[] content = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", "LISTDATE" };
+		
+		String[] met_attributes = {"min-len", "encoding", "strings:file_output"};
 
 		StringsConfig stringsConfig = new StringsConfig();
 		FileConfig fileConfig = new FileConfig();
 
 		Parser parser = new StringsParser();
 		ContentHandler handler = new BodyContentHandler();
+		Metadata metadata = new Metadata();
 
 		ParseContext context = new ParseContext();
 		context.set(StringsConfig.class, stringsConfig);
@@ -56,15 +60,19 @@ public class StringsParserTest {
 		InputStream stream = StringsParserTest.class.getResourceAsStream(resource);
 
 		try {
-			parser.parse(stream, handler, new Metadata(), context);
+			parser.parse(stream, handler, metadata, context);
 		} catch (Exception e) {
 			e.printStackTrace();
 		} finally {
 			stream.close();
 		}
-		
+
+		// Content
 		for (String word : content) {
 			assertTrue(handler.toString().contains(word));
 		}
+		
+		// Metadata
+		Arrays.equals(met_attributes, metadata.names());
 	}
 }