You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/02/25 07:47:13 UTC
svn commit: r1662171 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/strings/
test/java/org/apache/tika/parser/strings/
Author: mattmann
Date: Wed Feb 25 06:47:13 2015
New Revision: 1662171
URL: http://svn.apache.org/r1662171
Log:
Updated tests for TIKA-1541 simple strings parser from Giuseppe Totaro.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java Wed Feb 25 06:47:13 2015
@@ -15,6 +15,9 @@ package org.apache.tika.parser.strings;
import java.io.File;
import java.io.Serializable;
+import java.util.Properties;
+import java.io.InputStream;
+import java.io.IOException;
/**
* Configuration for the "strings" (or strings-alternative) command.
@@ -27,10 +30,10 @@ public class StringsConfig implements Se
private static final long serialVersionUID = -1465227101645003594L;
private String stringsPath = "";
-
+
// Minimum sequence length (characters) to print
private int minLength = 4;
-
+
// Character encoding of the strings that are to be found
private StringsEncoding encoding = StringsEncoding.SINGLE_7_BIT;
@@ -38,10 +41,57 @@ public class StringsConfig implements Se
private int timeout = 120;
/**
- * Default constructor.
+ * Default constructor.
*/
public StringsConfig() {
- // TODO Loads properties from InputStream.
+ init(this.getClass().getResourceAsStream("Strings.properties"));
+ }
+
+ /**
+ * Loads properties from InputStream and then tries to close InputStream. If
+ * there is an IOException, this silently swallows the exception and goes
+ * back to the default.
+ *
+ * @param is
+ */
+ public StringsConfig(InputStream is) {
+ init(is);
+ }
+
+ /**
+ * Initializes attributes.
+ *
+ * @param is
+ */
+ private void init(InputStream is) {
+ if (is == null) {
+ return;
+ }
+ Properties props = new Properties();
+ try {
+ props.load(is);
+ } catch (IOException e) {
+ // swallow
+ } finally {
+ if (is != null) {
+ try {
+ is.close();
+ } catch (IOException e) {
+ // swallow
+ }
+ }
+ }
+
+ setStringsPath(props.getProperty("stringsPath", "" + getStringsPath()));
+
+ setMinLength(Integer.parseInt(props.getProperty("minLength", ""
+ + getMinLength())));
+
+ setEncoding(StringsEncoding.valueOf(props.getProperty("encoding", ""
+ + getEncoding().get())));
+
+ setTimeout(Integer.parseInt(props.getProperty("timeout", ""
+ + getTimeout())));
}
/**
@@ -52,7 +102,7 @@ public class StringsConfig implements Se
public String getStringsPath() {
return this.stringsPath;
}
-
+
/**
* Returns the minimum sequence length (characters) to print.
*
@@ -61,11 +111,12 @@ public class StringsConfig implements Se
public int getMinLength() {
return this.minLength;
}
-
+
/**
* Returns the character encoding of the strings that are to be found.
*
- * @return {@see StringsEncoding} enum that represents the character encoding of the strings that are to be found.
+ * @return {@link StringsEncoding} enum that represents the character
+ * encoding of the strings that are to be found.
*/
public StringsEncoding getEncoding() {
return this.encoding;
@@ -85,40 +136,52 @@ public class StringsConfig implements Se
/**
* Sets the "strings" installation folder.
*
- * @param path the "strings" installation folder.
+ * @param path
+ * the "strings" installation folder.
*/
public void setStringsPath(String path) {
- char lastChar = path.charAt(path.length() - 1);
-
- if (lastChar != File.separatorChar) {
+ if (!path.isEmpty() && !path.endsWith(File.separator)) {
path += File.separatorChar;
}
this.stringsPath = path;
}
-
+
/**
* Sets the minimum sequence length (characters) to print.
*
- * @param minLength the minimum sequence length (characters) to print.
+ * @param minLength
+ * the minimum sequence length (characters) to print.
*/
public void setMinLength(int minLength) {
+ if (minLength < 1) {
+ throw new IllegalArgumentException("Invalid minimum length");
+ }
this.minLength = minLength;
}
-
+
/**
* Sets the character encoding of the strings that are to be found.
*
- * @param encoding {@see StringsEncoding} enum that represents the character encoding of the strings that are to be found.
+ * @param encoding
+ * {@link StringsEncoding} enum that represents the character
+ * encoding of the strings that are to be found.
*/
- public void setEncodings(StringsEncoding encoding) {
+ public void setEncoding(StringsEncoding encoding) {
this.encoding = encoding;
}
/**
- * Sets the maximum time (in seconds) to wait for the "strings" command to terminate.
- * @param timeout the maximum time (in seconds) to wait for the "strings" command to terminate.
+ * Sets the maximum time (in seconds) to wait for the "strings" command to
+ * terminate.
+ *
+ * @param timeout
+ * the maximum time (in seconds) to wait for the "strings"
+ * command to terminate.
*/
public void setTimeout(int timeout) {
+ if (timeout < 1) {
+ throw new IllegalArgumentException("Invalid timeout");
+ }
this.timeout = timeout;
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java Wed Feb 25 06:47:13 2015
@@ -18,7 +18,7 @@ package org.apache.tika.parser.strings;
*
*/
public enum StringsEncoding {
- SINGLE_7_BIT('s', "single-7-bit-byte"),
+ SINGLE_7_BIT('s', "single-7-bit-byte"), // default
SINGLE_8_BIT('S', "single-8-bit-byte"),
BIGENDIAN_16_BIT('b', "16-bit bigendian"),
LITTLEENDIAN_16_BIT('l', "16-bit littleendian"),
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java Wed Feb 25 06:47:13 2015
@@ -63,7 +63,11 @@ public class StringsParser extends Abstr
private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig();
- // String -> Boolean[2] (0 -> is_present. 1 -> supports_encoding)
+ /*
+ * This map is organized as follows:
+ * command's pathname (String) -> is it present? (Boolean), does it support -e option? (Boolean)
+ * It stores check results for command and, if present, -e (encoding) option.
+ */
private static Map<String,Boolean[]> STRINGS_PRESENT = new HashMap<String, Boolean[]>();
@Override
@@ -121,24 +125,32 @@ public class StringsParser extends Abstr
}
String[] checkCmd = { stringsProg, "--version" };
+ try {
+ boolean hasStrings = ExternalParser.check(checkCmd);
- boolean hasStrings = ExternalParser.check(checkCmd);
-
- boolean encodingOpt = false;
-
- // Check if the -e option (encoding) is supported
- if (!System.getProperty("os.name").startsWith("Windows")) {
- String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"};
- int[] errorValues = {1, 2}; // 1: General error. 2: Incorrect usage.
- encodingOpt = ExternalParser.check(checkOpt, errorValues);
- }
+ boolean encodingOpt = false;
+
+ // Check if the -e option (encoding) is supported
+ if (!System.getProperty("os.name").startsWith("Windows")) {
+ String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"};
+ int[] errorValues = {1, 2}; // Exit status code: 1 = general error; 2 = incorrect usage.
+ encodingOpt = ExternalParser.check(checkOpt, errorValues);
+ }
- Boolean[] values = {hasStrings, encodingOpt};
- STRINGS_PRESENT.put(stringsProg, values);
+ Boolean[] values = {hasStrings, encodingOpt};
+ STRINGS_PRESENT.put(stringsProg, values);
- return hasStrings;
+ return hasStrings;
+ } catch (NoClassDefFoundError ncdfe) {
+ // This happens under OSGi + Fork Parser - see TIKA-1507
+ // As a workaround for now, just say we can't use strings
+ // TODO Resolve it so we don't need this try/catch block
+ Boolean[] values = {false, false};
+ STRINGS_PRESENT.put(stringsProg, values);
+ return false;
+ }
}
-
+
/**
* Checks if the "file" command is supported.
*
@@ -183,7 +195,7 @@ public class StringsParser extends Abstr
cmdList.add(stringsProg);
cmdList.add("-n");
cmdList.add("" + config.getMinLength());;
- // encoding option is not supported by windows version
+ // Currently, encoding option is not supported by Windows (and other) versions
if (STRINGS_PRESENT.get(stringsProg)[1]) {
cmdList.add("-e");
cmdList.add("" + config.getEncoding().get());
@@ -191,7 +203,7 @@ public class StringsParser extends Abstr
cmdList.add(input.getPath());
String[] cmd = cmdList.toArray(new String[cmdList.size()]);
-
+
ProcessBuilder pb = new ProcessBuilder(cmd);
final Process process = pb.start();
@@ -312,10 +324,8 @@ public class StringsParser extends Abstr
fileOutput = reader.readLine();
} catch (IOException ioe) {
- // TODO
- System.err
- .println("An error occurred in reading output of the file command: "
- + ioe.getMessage());
+ // file output not available!
+ fileOutput = "";
} finally {
reader.close();
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java Wed Feb 25 06:47:13 2015
@@ -15,6 +15,9 @@ package org.apache.tika.parser.strings;
import static org.junit.Assert.*;
+import java.io.File;
+import java.io.InputStream;
+
import org.junit.Test;
public class StringsConfigTest {
@@ -27,4 +30,32 @@ public class StringsConfigTest {
assertEquals("Invalid default min-len value", 4, config.getMinLength());
assertEquals("Invalid default timeout value", 120, config.getTimeout());
}
+
+ @Test
+ public void testPartialConfig() {
+ InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-partial.properties");
+
+ StringsConfig config = new StringsConfig(stream);
+ assertEquals("Invalid default stringsPath value", "", config.getStringsPath());
+ assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+ assertEquals("Invalid default min-len value", 4, config.getMinLength());
+ assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+ }
+
+ @Test
+ public void testFullConfig() {
+ InputStream stream = StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-full.properties");
+
+ StringsConfig config = new StringsConfig(stream);
+ assertEquals("Invalid overridden stringsPath value", "/opt/strings" + File.separator, config.getStringsPath());
+ assertEquals("Invalid overridden encoding value", StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+ assertEquals("Invalid overridden min-len value", 3, config.getMinLength());
+ assertEquals("Invalid overridden timeout value", 60, config.getTimeout());
+ }
+
+ @Test(expected=IllegalArgumentException.class)
+ public void testValidateEconding() {
+ StringsConfig config = new StringsConfig();
+ config.setMinLength(0);
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java Wed Feb 25 06:47:13 2015
@@ -18,6 +18,7 @@ import static org.junit.Assert.*;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
+import java.util.Arrays;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -42,12 +43,15 @@ public class StringsParserTest {
String resource = "/test-documents/testOCTET_header.dbase3";
String[] content = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", "LISTDATE" };
+
+ String[] met_attributes = {"min-len", "encoding", "strings:file_output"};
StringsConfig stringsConfig = new StringsConfig();
FileConfig fileConfig = new FileConfig();
Parser parser = new StringsParser();
ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
context.set(StringsConfig.class, stringsConfig);
@@ -56,15 +60,19 @@ public class StringsParserTest {
InputStream stream = StringsParserTest.class.getResourceAsStream(resource);
try {
- parser.parse(stream, handler, new Metadata(), context);
+ parser.parse(stream, handler, metadata, context);
} catch (Exception e) {
e.printStackTrace();
} finally {
stream.close();
}
-
+
+ // Content
for (String word : content) {
assertTrue(handler.toString().contains(word));
}
+
+ // Metadata
+ Arrays.equals(met_attributes, metadata.names());
}
}