You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/07/05 09:08:46 UTC
[nutch] branch master updated: Fix for NUTCH-2397 (improved solution contributed by Vipul Behl, closes #196): - do not add superfluous line breaks and space - fix unit tests - also fix parse-html in addition to parse-tika

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 48c38b0  Fix for NUTCH-2397 (improved solution contributed by Vipul Behl, closes #196): - do not add superfluous line breaks and space - fix unit tests - also fix parse-html in addition to parse-tika
48c38b0 is described below

commit 48c38b03f3cfb73402431f262990a6d091570e9a
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Jul 4 14:25:44 2017 +0200

    Fix for NUTCH-2397 (improved solution contributed by Vipul Behl, closes #196):
    - do not add superfluous line breaks and space
    - fix unit tests
    - also fix parse-html in addition to parse-tika
---
 .../apache/nutch/parse/html/DOMContentUtils.java   | 47 +++++++++++++++++++++-
 .../apache/nutch/parse/tika/DOMContentUtils.java   | 47 ++++++++++++++++++++--
 .../org/apache/nutch/parse/zip/TestZipParser.java  |  7 +++-
 3 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 3c2aba0..0d6b2a9 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -157,9 +157,10 @@ public class DOMContentUtils {
         text = text.replaceAll("\\s+", " ");
         text = text.trim();
         if (text.length() > 0) {
-          if (sb.length() > 0)
-            sb.append(' ');
+          appendSpace(sb);
           sb.append(text);
+        } else {
+          appendParagraphSeparator(sb);
         }
       }
     }
@@ -168,6 +169,48 @@ public class DOMContentUtils {
   }
 
   /**
+   * Conditionally append a paragraph/line break to StringBuffer unless the last
+   * character already indicates a paragraph break. Also remove trailing spaces
+   * before the paragraph break.
+   *
+   * @param buffer
+   *          StringBuffer to append paragraph break
+   */
+  private void appendParagraphSeparator(StringBuffer buffer) {
+    if (buffer.length() == 0) {
+      return;
+    }
+    char lastChar = buffer.charAt(buffer.length() - 1);
+    if ('\n' != lastChar) {
+      // remove white space before paragraph break
+      while (lastChar == ' ') {
+        buffer.deleteCharAt(buffer.length() - 1);
+        lastChar = buffer.charAt(buffer.length() - 1);
+      }
+      if ('\n' != lastChar) {
+        buffer.append('\n');
+      }
+    }
+  }
+
+  /**
+   * Conditionally append a space to StringBuffer unless last character is a
+   * space or line/paragraph break.
+   *
+   * @param buffer
+   *          StringBuffer to append space
+   */
+  private void appendSpace(StringBuffer buffer) {
+    if (buffer.length() == 0) {
+      return;
+    }
+    char lastChar = buffer.charAt(buffer.length() - 1);
+    if (' ' != lastChar && '\n' != lastChar) {
+      buffer.append(' ');
+    }
+  }
+
+  /**
    * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
    * append the content text found beneath the first <code>title</code> node to
    * the <code>StringBuffer</code>.
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index b820e0c..a5b06c7 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -162,11 +162,10 @@ public class DOMContentUtils {
         text = text.replaceAll("\\s+", " ");
         text = text.trim();
         if (text.length() > 0) {
-          if (sb.length() > 0)
-            sb.append(' ');
+          appendSpace(sb);
           sb.append(text);
         } else {
-          sb.append("\n");
+          appendParagraphSeparator(sb);
         }
       }
     }
@@ -175,6 +174,48 @@ public class DOMContentUtils {
   }
 
   /**
+   * Conditionally append a paragraph/line break to StringBuffer unless the last
+   * character already indicates a paragraph break. Also remove trailing spaces
+   * before the paragraph break.
+   *
+   * @param buffer
+   *          StringBuffer to append paragraph break
+   */
+  private void appendParagraphSeparator(StringBuffer buffer) {
+    if (buffer.length() == 0) {
+      return;
+    }
+    char lastChar = buffer.charAt(buffer.length() - 1);
+    if ('\n' != lastChar) {
+      // remove white space before paragraph break
+      while (lastChar == ' ') {
+        buffer.deleteCharAt(buffer.length() - 1);
+        lastChar = buffer.charAt(buffer.length() - 1);
+      }
+      if ('\n' != lastChar) {
+        buffer.append('\n');
+      }
+    }
+  }
+
+  /**
+   * Conditionally append a space to StringBuffer unless last character is a
+   * space or line/paragraph break.
+   *
+   * @param buffer
+   *          StringBuffer to append space
+   */
+  private void appendSpace(StringBuffer buffer) {
+    if (buffer.length() == 0) {
+      return;
+    }
+    char lastChar = buffer.charAt(buffer.length() - 1);
+    if (' ' != lastChar && '\n' != lastChar) {
+      buffer.append(' ');
+    }
+  }
+
+  /**
    * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
    * append the content text found beneath the first <code>title</code> node to
    * the <code>StringBuffer</code>.
diff --git a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
index 17e386a..bbb0866 100644
--- a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
+++ b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
@@ -46,7 +46,7 @@ public class TestZipParser {
 
   private String[] sampleFiles = { "test.zip" };
 
-  private String expectedText = "textfile.txt This is text file number 1 ";
+  private String expectedText = "textfile.txt This is text file number 1";
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
@@ -64,7 +64,10 @@ public class TestZipParser {
           new CrawlDatum()).getContent();
       parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(
           content.getUrl());
-      Assert.assertTrue(parse.getText().equals(expectedText));
+      Assert.assertTrue(
+          "Extracted text does not start with <" + expectedText + ">: <"
+              + parse.getText() + ">",
+          parse.getText().startsWith(expectedText));
     }
   }
 

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.