You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ms...@apache.org on 2015/10/02 08:01:02 UTC
svn commit: r1706348 -
/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/BidiTest.java
Author: msahyoun
Date: Fri Oct 2 06:01:02 2015
New Revision: 1706348
URL: http://svn.apache.org/viewvc?rev=1706348&view=rev
Log:
PDFBOX-2252: revised unit test to deal with platform differences
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/BidiTest.java
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/BidiTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/BidiTest.java?rev=1706348&r1=1706347&r2=1706348&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/BidiTest.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/BidiTest.java Fri Oct 2 06:01:02 2015
@@ -17,15 +17,20 @@
package org.apache.pdfbox.text;
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
-import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.io.Reader;
+import java.io.LineNumberReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.After;
import org.junit.Before;
@@ -37,19 +42,33 @@ import org.junit.Test;
*/
public class BidiTest
{
+ /**
+ * Logger instance for this test class.
+ */
+ private static final Log log = LogFactory.getLog(BidiTest.class);
+
private static final File IN_DIR = new File("src/test/resources/org/apache/pdfbox/text/");
+ private static final File outDir = new File("target/test-output");
private static final String NAME_OF_PDF = "BidiSample.pdf";
- private static final String NAME_OF_TXT = "BidiSample.pdf.txt";
- private static final String NAME_OF_TXT_SORTED = "BidiSample.pdf-sorted.txt";
private static final String ENCODING = "UTF-8";
private PDDocument document;
private PDFTextStripper stripper;
+
+ private boolean bFail = false;
@Before
public void setUp() throws IOException
{
+ if (!outDir.exists())
+ {
+ if (!outDir.mkdirs())
+ {
+ throw (new IOException("Error creating " + outDir.getAbsolutePath() + " directory"));
+ }
+ }
+
document = PDDocument.load(new File(IN_DIR, NAME_OF_PDF));
stripper = new PDFTextStripper();
stripper.setLineSeparator("\n");
@@ -58,58 +77,227 @@ public class BidiTest
@Test
public void testSorted() throws IOException
{
- stripper.setSortByPosition(true);
- String extractedText = stripper.getText(document);
-
- Reader compareTextReader = new InputStreamReader(new FileInputStream(new File(IN_DIR, NAME_OF_TXT_SORTED)), ENCODING);;
- BufferedReader bufferedCompareTextReader = new BufferedReader(compareTextReader);
-
- StringBuilder compareTextBuilder = new StringBuilder();
-
- String line = bufferedCompareTextReader.readLine();
-
- while (line != null)
- {
- compareTextBuilder.append(line);
- compareTextBuilder.append('\n');
- line = bufferedCompareTextReader.readLine();
- }
-
- bufferedCompareTextReader.close();
-
- assertEquals(extractedText, compareTextBuilder.toString());
-
+ File testFile = new File(IN_DIR, NAME_OF_PDF);
+ doTestFile(testFile, outDir, false, true);
}
@Test
public void testNotSorted() throws IOException
{
- stripper.setSortByPosition(false);
- String extractedText = stripper.getText(document);
-
- Reader compareTextReader = new InputStreamReader(new FileInputStream(new File(IN_DIR, NAME_OF_TXT)), ENCODING);;
- BufferedReader bufferedCompareTextReader = new BufferedReader(compareTextReader);
-
- StringBuilder compareTextBuilder = new StringBuilder();
- String line = bufferedCompareTextReader.readLine();
+ File testFile = new File(IN_DIR, NAME_OF_PDF);
+ doTestFile(testFile, outDir, false, false);
+ }
- while (line != null)
+ @After
+ public void tearDown() throws IOException
+ {
+ document.close();
+ }
+
+ /**
+ * Validate text extraction on a single file.
+ *
+ * @param inFile The PDF file to validate
+ * @param outDir The directory to store the output in
+ * @param bLogResult Whether to log the extracted text
+ * @param bSort Whether or not the extracted text is sorted
+ * @throws IOException if the PDF cannot be loaded or the text files cannot be written or read
+ */
+ public void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort)
+ throws IOException
+ {
+ if(bSort)
{
- compareTextBuilder.append(line);
- compareTextBuilder.append('\n');
- line = bufferedCompareTextReader.readLine();
+ log.info("Preparing to parse " + inFile.getName() + " for sorted test");
+ }
+ else
+ {
+ log.info("Preparing to parse " + inFile.getName() + " for standard test");
}
- bufferedCompareTextReader.close();
-
- assertEquals(extractedText, compareTextBuilder.toString());
+ if (!outDir.exists())
+ {
+ if (!outDir.mkdirs())
+ {
+ throw (new IOException("Error creating " + outDir.getAbsolutePath() + " directory"));
+ }
+ }
+ PDDocument document = PDDocument.load(inFile);
+ try
+ {
+ File outFile;
+ File expectedFile;
+
+ if(bSort)
+ {
+ outFile = new File(outDir, inFile.getName() + "-sorted.txt");
+ expectedFile = new File(inFile.getParentFile(), inFile.getName() + "-sorted.txt");
+ }
+ else
+ {
+ outFile = new File(outDir, inFile.getName() + ".txt");
+ expectedFile = new File(inFile.getParentFile(), inFile.getName() + ".txt");
+ }
+
+ OutputStream os = new FileOutputStream(outFile);
+ try
+ {
+ Writer writer = new OutputStreamWriter(os, ENCODING);
+ try
+ {
+ //Allows for sorted tests
+ stripper.setSortByPosition(bSort);
+ stripper.writeText(document, writer);
+ }
+ finally
+ {
+ // close the written file before reading it again
+ writer.close();
+ }
+ }
+ finally
+ {
+ os.close();
+ }
+
+ if (bLogResult)
+ {
+ log.info("Text for " + inFile.getName() + ":");
+ log.info(stripper.getText(document));
+ }
+ if (!expectedFile.exists())
+ {
+ this.bFail = true;
+ fail("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() +
+ " did not exist");
+ }
+
+ LineNumberReader expectedReader =
+ new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile), ENCODING));
+ LineNumberReader actualReader =
+ new LineNumberReader(new InputStreamReader(new FileInputStream(outFile), ENCODING));
+ // fail() throws on the first mismatch, so the readers are closed in a finally block
+ try
+ {
+ while (true)
+ {
+ String expectedLine = expectedReader.readLine();
+ while( expectedLine != null && expectedLine.trim().length() == 0 )
+ {
+ expectedLine = expectedReader.readLine();
+ }
+ String actualLine = actualReader.readLine();
+ while( actualLine != null && actualLine.trim().length() == 0 )
+ {
+ actualLine = actualReader.readLine();
+ }
+ if (!stringsEqual(expectedLine, actualLine))
+ {
+ this.bFail = true;
+ fail("FAILURE: Line mismatch for file " + inFile.getName() +
+ " (sort = "+bSort+")" +
+ " at expected line: " + expectedReader.getLineNumber() +
+ " at actual line: " + actualReader.getLineNumber() +
+ "\nexpected line was: \"" + expectedLine + "\"" +
+ "\nactual line was: \"" + actualLine + "\"" + "\n");
+ }
+ if( expectedLine == null || actualLine==null)
+ {
+ break;
+ }
+ }
+ }
+ finally
+ {
+ expectedReader.close();
+ actualReader.close();
+ }
+ }
+ finally
+ {
+ document.close();
+ }
+ }
+
+ /**
+ * Determine whether two strings are equal, where two null strings are
+ * considered equal.
+ *
+ * @param expected Expected string
+ * @param actual Actual String
+ * @return <code>true</code> if the strings are both null,
+ * or if their contents are the same, otherwise <code>false</code>.
+ */
+ private boolean stringsEqual(String expected, String actual)
+ {
+ boolean equals = true;
+ if( (expected == null) && (actual == null) )
+ {
+ return true;
+ }
+ else if( expected != null && actual != null )
+ {
+ expected = expected.trim();
+ actual = actual.trim();
+ char[] expectedArray = expected.toCharArray();
+ char[] actualArray = actual.toCharArray();
+ int expectedIndex = 0;
+ int actualIndex = 0;
+ while( expectedIndex<expectedArray.length && actualIndex<actualArray.length )
+ {
+ if( expectedArray[expectedIndex] != actualArray[actualIndex] )
+ {
+ equals = false;
+ log.warn("Lines differ at index"
+ + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex]
+ + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex] );
+ break;
+ }
+ expectedIndex = skipWhitespace( expectedArray, expectedIndex );
+ actualIndex = skipWhitespace( actualArray, actualIndex );
+ expectedIndex++;
+ actualIndex++;
+ }
+ if( equals )
+ {
+ if( expectedIndex != expectedArray.length )
+ {
+ equals = false;
+ log.warn("Expected line is longer at:" + expectedIndex );
+ }
+ if( actualIndex != actualArray.length )
+ {
+ equals = false;
+ log.warn("Actual line is longer at:" + actualIndex );
+ }
+ }
+ }
+ else
+ {
+ equals = (expected == null && actual != null && actual.trim().isEmpty())
+ || (actual == null && expected != null && expected.trim().isEmpty());
+ }
+ return equals;
}
- @After
- public void tearDown() throws IOException
+ /**
+ * If the character at the current index is a space or has a code above 256, skip any subsequent such characters.
+ */
+ private int skipWhitespace( char[] array, int index )
{
- document.close();
+ //if we are at a space character then skip all space
+ //characters, but when all done rollback 1 because stringsEqual
+ //will roll forward 1
+ if( array[index] == ' ' || array[index] > 256 )
+ {
+ while( index < array.length && (array[index] == ' ' || array[index] > 256))
+ {
+ index++;
+ }
+ index--;
+ }
+ return index;
}
}