You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/07/05 09:08:46 UTC
[nutch] branch master updated: Fix for NUTCH-2397 (improved
solution contributed by Vipul Behl,
closes #196): - do not add superfluous line breaks and space - fix unit
tests - also fix parse-html in addition to parse-tika
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 48c38b0 Fix for NUTCH-2397 (improved solution contributed by Vipul Behl, closes #196): - do not add superfluous line breaks and space - fix unit tests - also fix parse-html in addition to parse-tika
48c38b0 is described below
commit 48c38b03f3cfb73402431f262990a6d091570e9a
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Jul 4 14:25:44 2017 +0200
Fix for NUTCH-2397 (improved solution contributed by Vipul Behl, closes #196):
- do not add superfluous line breaks and space
- fix unit tests
- also fix parse-html in addition to parse-tika
---
.../apache/nutch/parse/html/DOMContentUtils.java | 47 +++++++++++++++++++++-
.../apache/nutch/parse/tika/DOMContentUtils.java | 47 ++++++++++++++++++++--
.../org/apache/nutch/parse/zip/TestZipParser.java | 7 +++-
3 files changed, 94 insertions(+), 7 deletions(-)
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 3c2aba0..0d6b2a9 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -157,9 +157,10 @@ public class DOMContentUtils {
text = text.replaceAll("\\s+", " ");
text = text.trim();
if (text.length() > 0) {
- if (sb.length() > 0)
- sb.append(' ');
+ appendSpace(sb);
sb.append(text);
+ } else {
+ appendParagraphSeparator(sb);
}
}
}
@@ -168,6 +169,48 @@ public class DOMContentUtils {
}
/**
+ * Conditionally append a paragraph/line break to StringBuffer unless last
+ * character already indicates a paragraph break. Also remove trailing space
+ * before paragraph break.
+ *
+ * @param buffer
+ * StringBuffer to append paragraph break
+ */
+ private void appendParagraphSeparator(StringBuffer buffer) {
+ if (buffer.length() == 0) {
+ return;
+ }
+ char lastChar = buffer.charAt(buffer.length() - 1);
+ if ('\n' != lastChar) {
+ // remove white space before paragraph break
+ while (lastChar == ' ') {
+ buffer.deleteCharAt(buffer.length() - 1);
+ lastChar = buffer.charAt(buffer.length() - 1);
+ }
+ if ('\n' != lastChar) {
+ buffer.append('\n');
+ }
+ }
+ }
+
+ /**
+ * Conditionally append a space to StringBuffer unless last character is a
+ * space or line/paragraph break.
+ *
+ * @param buffer
+ * StringBuffer to append space
+ */
+ private void appendSpace(StringBuffer buffer) {
+ if (buffer.length() == 0) {
+ return;
+ }
+ char lastChar = buffer.charAt(buffer.length() - 1);
+ if (' ' != lastChar && '\n' != lastChar) {
+ buffer.append(' ');
+ }
+ }
+
+ /**
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* append the content text found beneath the first <code>title</code> node to
* the <code>StringBuffer</code>.
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index b820e0c..a5b06c7 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -162,11 +162,10 @@ public class DOMContentUtils {
text = text.replaceAll("\\s+", " ");
text = text.trim();
if (text.length() > 0) {
- if (sb.length() > 0)
- sb.append(' ');
+ appendSpace(sb);
sb.append(text);
} else {
- sb.append("\n");
+ appendParagraphSeparator(sb);
}
}
}
@@ -175,6 +174,48 @@ public class DOMContentUtils {
}
/**
+ * Conditionally append a paragraph/line break to StringBuffer unless last
+ * character already indicates a paragraph break. Also remove trailing space
+ * before paragraph break.
+ *
+ * @param buffer
+ * StringBuffer to append paragraph break
+ */
+ private void appendParagraphSeparator(StringBuffer buffer) {
+ if (buffer.length() == 0) {
+ return;
+ }
+ char lastChar = buffer.charAt(buffer.length() - 1);
+ if ('\n' != lastChar) {
+ // remove white space before paragraph break
+ while (lastChar == ' ') {
+ buffer.deleteCharAt(buffer.length() - 1);
+ lastChar = buffer.charAt(buffer.length() - 1);
+ }
+ if ('\n' != lastChar) {
+ buffer.append('\n');
+ }
+ }
+ }
+
+ /**
+ * Conditionally append a space to StringBuffer unless last character is a
+ * space or line/paragraph break.
+ *
+ * @param buffer
+ * StringBuffer to append space
+ */
+ private void appendSpace(StringBuffer buffer) {
+ if (buffer.length() == 0) {
+ return;
+ }
+ char lastChar = buffer.charAt(buffer.length() - 1);
+ if (' ' != lastChar && '\n' != lastChar) {
+ buffer.append(' ');
+ }
+ }
+
+ /**
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* append the content text found beneath the first <code>title</code> node to
* the <code>StringBuffer</code>.
diff --git a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
index 17e386a..bbb0866 100644
--- a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
+++ b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
@@ -46,7 +46,7 @@ public class TestZipParser {
private String[] sampleFiles = { "test.zip" };
- private String expectedText = "textfile.txt This is text file number 1 ";
+ private String expectedText = "textfile.txt This is text file number 1";
@Test
public void testIt() throws ProtocolException, ParseException {
@@ -64,7 +64,10 @@ public class TestZipParser {
new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(
content.getUrl());
- Assert.assertTrue(parse.getText().equals(expectedText));
+ Assert.assertTrue(
+ "Extracted text does not start with <" + expectedText + ">: <"
+ + parse.getText() + ">",
+ parse.getText().startsWith(expectedText));
}
}
--
To stop receiving notification emails like this one, please contact
['"commits@nutch.apache.org" <co...@nutch.apache.org>'].