You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by to...@apache.org on 2016/09/08 13:35:28 UTC
[4/4] crunch git commit: CRUNCH-616: Replace (possibly copyrighted) Maugham text with Dickens. Contributed by Sean Owen.

CRUNCH-616: Replace (possibly copyrighted) Maugham text with Dickens. Contributed by Sean Owen.

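Remove the non-applicable Project Gutenberg license from LICENSE, and remove 382 lines from shakes.txt, including its Project Gutenberg header line. Adjust the many tests whose expected line counts, word counts, and first-line values depend on the new text.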


Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/5d237b36
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/5d237b36
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/5d237b36

Branch: refs/heads/master
Commit: 5d237b36609484d49c30fa92fdf9613b6eee9d91
Parents: f1d074c
Author: Tom White <to...@apache.org>
Authored: Thu Sep 8 14:12:30 2016 +0100
Committer: Tom White <to...@apache.org>
Committed: Thu Sep 8 14:12:30 2016 +0100

----------------------------------------------------------------------
 LICENSE                                         |   298 -
 .../it/java/org/apache/crunch/CleanTextIT.java  |     2 +-
 .../org/apache/crunch/CollectionPObjectIT.java  |     4 +-
 .../org/apache/crunch/CollectionsLengthIT.java  |     4 +-
 .../apache/crunch/DeepCopyCustomTuplesIT.java   |     2 +-
 .../apache/crunch/FirstElementPObjectIT.java    |     2 +-
 .../it/java/org/apache/crunch/PObjectsIT.java   |     2 +-
 .../org/apache/crunch/PipelineCallableIT.java   |     2 +-
 .../it/java/org/apache/crunch/RecordDropIT.java |     2 +-
 .../apache/crunch/StageResultsCountersIT.java   |     2 +-
 .../it/java/org/apache/crunch/WordCountIT.java  |     8 +-
 .../apache/crunch/impl/mr/plan/DotfilesIT.java  |     4 +-
 .../it/java/org/apache/crunch/lib/MapredIT.java |     4 +-
 .../java/org/apache/crunch/lib/MapreduceIT.java |     2 +-
 .../lib/join/AbstractFullOuterJoinIT.java       |     4 +-
 .../crunch/lib/join/AbstractInnerJoinIT.java    |     4 +-
 .../lib/join/AbstractLeftOuterJoinIT.java       |     4 +-
 .../lib/join/AbstractRightOuterJoinIT.java      |     4 +-
 .../org/apache/crunch/lib/join/JoinTester.java  |     6 +-
 .../apache/crunch/io/hbase/HFileTargetIT.java   |    16 +-
 .../scrunch/AggregatorsIntegrationTest.scala    |     2 +-
 .../org/apache/crunch/scrunch/CogroupTest.scala |     6 +-
 .../apache/crunch/scrunch/IncrementTest.scala   |     8 +-
 .../org/apache/crunch/scrunch/JoinTest.scala    |    12 +-
 .../apache/crunch/scrunch/PCollectionTest.scala |     6 +-
 .../apache/crunch/scrunch/PipelineAppTest.scala |     2 +-
 .../org/apache/crunch/scrunch/TopTest.scala     |     2 +-
 .../org/apache/crunch/scrunch/UnionTest.scala   |    12 +-
 .../apache/crunch/scrunch/WordCountTest.scala   |     2 +-
 .../org/apache/crunch/SparkHFileTargetIT.java   |    16 +-
 .../apache/crunch/SparkPipelineCallableIT.java  |     2 +-
 crunch-test/src/main/resources/dickens.txt      | 23665 ++++++++++++++
 crunch-test/src/main/resources/maugham.txt      | 29112 -----------------
 crunch-test/src/main/resources/shakes.txt       |   382 -
 34 files changed, 23739 insertions(+), 29866 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/LICENSE
----------------------------------------------------------------------
diff --git a/LICENSE b/LICENSE
index 23c8577..ae4b6b6 100644
--- a/LICENSE
+++ b/LICENSE
@@ -240,304 +240,6 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.
 
---------------------------------------------------------------------------------
-Test cases use content provided by Project Gutenberg:
-
-THE FULL PROJECT GUTENBERG LICENSE
-
-PLEASE READ THIS BEFORE YOU DISTRIBUTE OR USE THIS WORK
-
-To protect the Project Gutenberg-tm mission of promoting the free distribution
-of electronic works, by using or distributing this work (or any other work
-associated in any way with the phrase "Project Gutenberg"), you agree to comply
-with all the terms of the Full Project Gutenberg-tm License available with this
-file or online at www.gutenberg.org/license.
-
-Section 1. General Terms of Use and Redistributing Project Gutenberg-tm
-electronic works 1.A. By reading or using any part of this Project Gutenberg-tm
-electronic work, you indicate that you have read, understand, agree to and
-accept all the terms of this license and intellectual property
-(trademark/copyright) agreement. If you do not agree to abide by all the terms
-of this agreement, you must cease using and return or destroy all copies of
-Project Gutenberg-tm electronic works in your possession. If you paid a fee for
-obtaining a copy of or access to a Project Gutenberg-tm electronic work and you
-do not agree to be bound by the terms of this agreement, you may obtain a
-refund from the person or entity to whom you paid the fee as set forth in
-paragraph 1.E.8.
-
-1.B. "Project Gutenberg" is a registered trademark. It may only be used on or
-associated in any way with an electronic work by people who agree to be bound
-by the terms of this agreement. There are a few things that you can do with
-most Project Gutenberg-tm electronic works even without complying with the full
-terms of this agreement. See paragraph 1.C below. There are a lot of things you
-can do with Project Gutenberg-tm electronic works if you follow the terms of
-this agreement and help preserve free future access to Project Gutenberg-tm
-electronic works. See paragraph 1.E below.
-
-1.C. The Project Gutenberg Literary Archive Foundation ("the Foundation" or
-PGLAF), owns a compilation copyright in the collection of Project Gutenberg-tm
-electronic works. Nearly all the individual works in the collection are in the
-public domain in the United States. If an individual work is in the public
-domain in the United States and you are located in the United States, we do not
-claim a right to prevent you from copying, distributing, performing, displaying
-or creating derivative works based on the work as long as all references to
-Project Gutenberg are removed. Of course, we hope that you will support the
-Project Gutenberg-tm mission of promoting free access to electronic works by
-freely sharing Project Gutenberg-tm works in compliance with the terms of this
-agreement for keeping the Project Gutenberg-tm name associated with the work.
-You can easily comply with the terms of this agreement by keeping this work in
-the same format with its attached full Project Gutenberg-tm License when you
-share it without charge with others.
-
-[*] This particular work is one of the few copyrighted individual works
-included with the permission of the copyright holder. Information on the
-copyright owner for this particular work and the terms of use imposed by the
-copyright holder on this work are set forth at the beginning of this work.
-
-1.D. The copyright laws of the place where you are located also govern what you
-can do with this work. Copyright laws in most countries are in a constant state
-of change. If you are outside the United States, check the laws of your country
-in addition to the terms of this agreement before downloading, copying,
-displaying, performing, distributing or creating derivative works based on this
-work or any other Project Gutenberg-tm work. The Foundation makes no
-representations concerning the copyright status of any work in any country
-outside the United States.
-
-1.E. Unless you have removed all references to Project Gutenberg:
-
-1.E.1. The following sentence, with active links to, or other immediate access
-to, the full Project Gutenberg-tm License must appear prominently whenever any
-copy of a Project Gutenberg-tm work (any work on which the phrase "Project
-Gutenberg" appears, or with which the phrase "Project Gutenberg" is associated)
-is accessed, displayed, performed, viewed, copied or distributed:
-
-This eBook is for the use of anyone anywhere at no cost and with almost no
-restrictions whatsoever. You may copy it, give it away or re-use it under the
-terms of the Project Gutenberg License included with this eBook or online at
-www.gutenberg.org
-
-1.E.2. If an individual Project Gutenberg-tm electronic work is derived from
-the public domain (does not contain a notice indicating that it is posted with
-permission of the copyright holder), the work can be copied and distributed to
-anyone in the United States without paying any fees or charges. If you are
-redistributing or providing access to a work with the phrase "Project
-Gutenberg" associated with or appearing on the work, you must comply either
-with the requirements of paragraphs 1.E.1 through 1.E.7 or obtain permission
-for the use of the work and the Project Gutenberg-tm trademark as set forth in
-paragraphs 1.E.8 or 1.E.9.
-
-1.E.3. If an individual Project Gutenberg-tm electronic work is posted with the
-permission of the copyright holder, your use and distribution must comply with
-both paragraphs 1.E.1 through 1.E.7 and any additional terms imposed by the
-copyright holder. Additional terms will be linked to the Project Gutenberg-tm
-License for all works posted with the permission of the copyright holder found
-at the beginning of this work.
-
-1.E.4. Do not unlink or detach or remove the full Project Gutenberg-tm License
-terms from this work, or any files containing a part of this work or any other
-work associated with Project Gutenberg-tm.
-
-1.E.5. Do not copy, display, perform, distribute or redistribute this
-electronic work, or any part of this electronic work, without prominently
-displaying the sentence set forth in paragraph 1.E.1 with active links or
-immediate access to the full terms of the Project Gutenberg-tm License.
-
-1.E.6. You may convert to and distribute this work in any binary, compressed,
-marked up, nonproprietary or proprietary form, including any word processing or
-hypertext form. However, if you provide access to or distribute copies of a
-Project Gutenberg-tm work in a format other than "Plain Vanilla ASCII" or other
-format used in the official version posted on the official Project Gutenberg-tm
-web site (www.gutenberg.org), you must, at no additional cost, fee or expense
-to the user, provide a copy, a means of exporting a copy, or a means of
-obtaining a copy upon request, of the work in its original "Plain Vanilla
-ASCII" or other form. Any alternate format must include the full Project
-Gutenberg-tm License as specified in paragraph 1.E.1.
-
-1.E.7. Do not charge a fee for access to, viewing, displaying, performing,
-copying or distributing any Project Gutenberg-tm works unless you comply with
-paragraph 1.E.8 or 1.E.9.
-
-1.E.8. You may charge a reasonable fee for copies of or providing access to or
-distributing Project Gutenberg-tm electronic works provided that
-
-You pay a royalty fee of 20% of the gross profits you derive from the use of
-Project Gutenberg-tm works calculated using the method you already use to
-calculate your applicable taxes. The fee is owed to the owner of the Project
-Gutenberg-tm trademark, but he has agreed to donate royalties under this
-paragraph to the Project Gutenberg Literary Archive Foundation. Royalty
-payments must be paid within 60 days following each date on which you prepare
-(or are legally required to prepare) your periodic tax returns. Royalty
-payments should be clearly marked as such and sent to the Project Gutenberg
-Literary Archive Foundation at the address specified in Section 4, "Information
-about donations to the Project Gutenberg Literary Archive Foundation." You
-provide a full refund of any money paid by a user who notifies you in writing
-(or by e-mail) within 30 days of receipt that s/he does not agree to the terms
-of the full Project Gutenberg-tm License. You must require such a user to
-return or destroy all copies of the works possessed in a physical medium and
-discontinue all use of and all access to other copies of Project Gutenberg-tm
-works.
-You provide, in accordance with paragraph 1.F.3, a full refund of any money
-paid for a work or a replacement copy, if a defect in the electronic work is
-discovered and reported to you within 90 days of receipt of the work.  You
-comply with all other terms of this agreement for free distribution of Project
-Gutenberg-tm works.
-
-1.E.9. If you wish to charge a fee or distribute a Project Gutenberg-tm
-electronic work or group of works on different terms than are set forth in this
-agreement, you must obtain permission in writing from both the Project
-Gutenberg Literary Archive Foundation and Michael Hart, the owner of the
-Project Gutenberg-tm trademark. Contact the Foundation as set forth in Section
-3 below.
-
-1.F.
-
-1.F.1. Project Gutenberg volunteers and employees expend considerable effort to
-identify, do copyright research on, transcribe and proofread public domain
-works in creating the Project Gutenberg-tm collection. Despite these efforts,
-Project Gutenberg-tm electronic works, and the medium on which they may be
-stored, may contain "Defects," such as, but not limited to, incomplete,
-inaccurate or corrupt data, transcription errors, a copyright or other
-intellectual property infringement, a defective or damaged disk or other
-medium, a computer virus, or computer codes that damage or cannot be read by
-your equipment.
-
-1.F.2. LIMITED WARRANTY, DISCLAIMER OF DAMAGES - Except for the "Right of
-Replacement or Refund" described in paragraph 1.F.3, the Project Gutenberg
-Literary Archive Foundation, the owner of the Project Gutenberg-tm trademark,
-and any other party distributing a Project Gutenberg-tm electronic work under
-this agreement, disclaim all liability to you for damages, costs and expenses,
-including legal fees. YOU AGREE THAT YOU HAVE NO REMEDIES FOR NEGLIGENCE,
-STRICT LIABILITY, BREACH OF WARRANTY OR BREACH OF CONTRACT EXCEPT THOSE
-PROVIDED IN PARAGRAPH 1.F.3. YOU AGREE THAT THE FOUNDATION, THE TRADEMARK
-OWNER, AND ANY DISTRIBUTOR UNDER THIS AGREEMENT WILL NOT BE LIABLE TO YOU FOR
-ACTUAL, DIRECT, INDIRECT, CONSEQUENTIAL, PUNITIVE OR INCIDENTAL DAMAGES EVEN IF
-YOU GIVE NOTICE OF THE POSSIBILITY OF SUCH DAMAGE.
-
-1.F.3. LIMITED RIGHT OF REPLACEMENT OR REFUND - If you discover a defect in
-this electronic work within 90 days of receiving it, you can receive a refund
-of the money (if any) you paid for it by sending a written explanation to the
-person you received the work from. If you received the work on a physical
-medium, you must return the medium with your written explanation. The person or
-entity that provided you with the defective work may elect to provide a
-replacement copy in lieu of a refund. If you received the work electronically,
-the person or entity providing it to you may choose to give you a second
-opportunity to receive the work electronically in lieu of a refund. If the
-second copy is also defective, you may demand a refund in writing without
-further opportunities to fix the problem.
-
-1.F.4. Except for the limited right of replacement or refund set forth in
-paragraph 1.F.3, this work is provided to you 'AS-IS', WITH NO OTHER WARRANTIES
-OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES OF
-MERCHANTABILITY OR FITNESS FOR ANY PURPOSE.
-
-1.F.5. Some states do not allow disclaimers of certain implied warranties or
-the exclusion or limitation of certain types of damages. If any disclaimer or
-limitation set forth in this agreement violates the law of the state applicable
-to this agreement, the agreement shall be interpreted to make the maximum
-disclaimer or limitation permitted by the applicable state law. The invalidity
-or unenforceability of any provision of this agreement shall not void the
-remaining provisions.
-
-1.F.6. INDEMNITY - You agree to indemnify and hold the Foundation, the
-trademark owner, any agent or employee of the Foundation, anyone providing
-copies of Project Gutenberg-tm electronic works in accordance with this
-agreement, and any volunteers associated with the production, promotion and
-distribution of Project Gutenberg-tm electronic works, harmless from all
-liability, costs and expenses, including legal fees, that arise directly or
-indirectly from any of the following which you do or cause to occur: (a)
-distribution of this or any Project Gutenberg-tm work, (b) alteration,
-modification, or additions or deletions to any Project Gutenberg-tm work, and
-(c) any Defect you cause.
-
-Section 2. Information about the Mission of Project Gutenberg-tm Project
-Gutenberg-tm is synonymous with the free distribution of electronic works in
-formats readable by the widest variety of computers including obsolete, old,
-middle-aged and new computers. It exists because of the efforts of hundreds of
-volunteers and donations from people in all walks of life.
-
-Volunteers and financial support to provide volunteers with the assistance they
-need are critical to reaching Project Gutenberg-tm's goals and ensuring that
-the Project Gutenberg-tm collection will remain freely available for
-generations to come. In 2001, the Project Gutenberg Literary Archive Foundation
-was created to provide a secure and permanent future for Project Gutenberg-tm
-and future generations. To learn more about the Project Gutenberg Literary
-Archive Foundation and how your efforts and donations can help, see Sections 3
-and 4 and the Foundation information page at www.gutenberg.org
-
-Section 3. Information about the Project Gutenberg Literary Archive Foundation
-The Project Gutenberg Literary Archive Foundation is a non profit 501(c)(3)
-educational corporation organized under the laws of the state of Mississippi
-and granted tax exempt status by the Internal Revenue Service. The Foundation's
-EIN or federal tax identification number is 64-6221541. Contributions to the
-Project Gutenberg Literary Archive Foundation are tax deductible to the full
-extent permitted by U.S. federal laws and your state's laws.
-
-The Foundation's principal office is located at 4557 Melan Dr. S. Fairbanks,
-AK, 99712., but its volunteers and employees are scattered throughout numerous
-locations. Its business office is located at 809 North 1500 West, Salt Lake
-City, UT 84116, (801) 596-1887. Email contact links and up to date contact
-information can be found at the Foundation's web site and official page at
-www.gutenberg.org/contact
-
-For additional contact information:
-
-    Dr. Gregory B. Newby
-    Chief Executive and Director
-    gbnewby@pglaf.org
-
-Section 4. Information about Donations to the Project Gutenberg Literary
-Archive Foundation Project Gutenberg-tm depends upon and cannot survive without
-wide spread public support and donations to carry out its mission of increasing
-the number of public domain and licensed works that can be freely distributed
-in machine readable form accessible by the widest array of equipment including
-outdated equipment. Many small donations ($1 to $5,000) are particularly
-important to maintaining tax exempt status with the IRS.
-
-The Foundation is committed to complying with the laws regulating charities and
-charitable donations in all 50 states of the United States. Compliance
-requirements are not uniform and it takes a considerable effort, much paperwork
-and many fees to meet and keep up with these requirements. We do not solicit
-donations in locations where we have not received written confirmation of
-compliance. To SEND DONATIONS or determine the status of compliance for any
-particular state visit www.gutenberg.org/donate
-
-While we cannot and do not solicit contributions from states where we have not
-met the solicitation requirements, we know of no prohibition against accepting
-unsolicited donations from donors in such states who approach us with offers to
-donate.
-
-International donations are gratefully accepted, but we cannot make any
-statements concerning tax treatment of donations received from outside the
-United States. U.S. laws alone swamp our small staff.
-
-Please check the Project Gutenberg Web pages for current donation methods and
-addresses. Donations are accepted in a number of other ways including checks,
-online payments and credit card donations. To donate, please visit:
-www.gutenberg.org/donate
-
-Section 5. General Information About Project Gutenberg-tm electronic works.
-Professor Michael S. Hart was the originator of the Project Gutenberg-tm
-concept of a library of electronic works that could be freely shared with
-anyone. For forty years, he produced and distributed Project Gutenberg-tm
-eBooks with only a loose network of volunteer support.
-
-Project Gutenberg-tm eBooks are often created from several printed editions,
-all of which are confirmed as Public Domain in the U.S. unless a copyright
-notice is included. Thus, we do not necessarily keep eBooks in compliance with
-any particular paper edition.
-
-Most people start at our Web site which has the main PG search facility:
-www.gutenberg.org
-
-This Web site includes information about Project Gutenberg-tm, including how to
-make donations to the Project Gutenberg Literary Archive Foundation, how to
-help produce our new eBooks, and how to subscribe to our email newsletter to
-hear about new eBooks.
-
-[*] This paragraph, after 1.C., is included only for copyrighted works. For
-those, you must contact the copyright holder before any non-free use or removal
-of the Project Gutenberg header.
-
 
 ================================================================================
 The binary distribution for Apache Crunch includes the following

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
index 9d6f682..563af07 100644
--- a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
@@ -41,7 +41,7 @@ import com.google.common.io.Files;
  */
 public class CleanTextIT {
 
-  private static final int LINES_IN_SHAKES = 3667;
+  private static final int LINES_IN_SHAKES = 3285;
   
   @Rule
   public TemporaryPath tmpDir = TemporaryPaths.create();

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
index 7e0c75c..08e5ac2 100644
--- a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
@@ -37,10 +37,10 @@ import org.junit.Test;
 @SuppressWarnings("serial")
 public class CollectionPObjectIT {
 
-  private static final int LINES_IN_SHAKES = 3667;
+  private static final int LINES_IN_SHAKES = 3285;
 
   private static final String FIRST_SHAKESPEARE_LINE =
-      "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
+      "The Tragedie of Macbeth";
 
   private static final String LAST_SHAKESPEARE_LINE =
       "FINIS. THE TRAGEDIE OF MACBETH.";

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
index f1a33a2..f676bab 100644
--- a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
@@ -34,7 +34,7 @@ import org.junit.Test;
 @SuppressWarnings("serial")
 public class CollectionsLengthIT {
 
-  public static final Long LINES_IN_SHAKESPEARE = 3667L;
+  public static final Long LINES_IN_SHAKESPEARE = 3285L;
 
   @Rule
   public TemporaryPath tmpDir = TemporaryPaths.create();
@@ -64,6 +64,6 @@ public class CollectionsLengthIT {
 
     PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
     Long length = shakespeare.length().getValue();
-    assertEquals("Incorrect length for shakespear PCollection.", LINES_IN_SHAKESPEARE, length);
+    assertEquals("Incorrect length for Shakespeare PCollection.", LINES_IN_SHAKESPEARE, length);
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
index f1323ca..54f9917 100644
--- a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
@@ -54,7 +54,7 @@ public class DeepCopyCustomTuplesIT {
         .groupByKey()
         .parallelDo(new PostProcFn(), strings())
         .materialize();
-    assertEquals(65, Iterables.size(out));
+    assertEquals(59, Iterables.size(out));
     p.done();
   }
   

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
index d985e10..a016c12 100644
--- a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
@@ -36,7 +36,7 @@ import org.junit.Test;
 public class FirstElementPObjectIT {
 
   private static final String FIRST_SHAKESPEARE_LINE =
-      "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
+      "The Tragedie of Macbeth";
 
   @Rule
   public TemporaryPath tmpDir = TemporaryPaths.create();

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
index 6ee849f..42c046a 100644
--- a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
@@ -37,7 +37,7 @@ import org.junit.Test;
 @SuppressWarnings("serial")
 public class PObjectsIT {
 
-  private static final Integer LINES_IN_SHAKES = 3667;
+  private static final Integer LINES_IN_SHAKES = 3285;
 
   @Rule
   public TemporaryPath tmpDir = TemporaryPaths.create();

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java b/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
index 95638a1..ff5dc60 100644
--- a/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
@@ -95,7 +95,7 @@ public class PipelineCallableIT {
       assertFalse(p.run().succeeded());
     } else {
       Map<String, Long> counts = top3.materializeToMap();
-      assertEquals(ImmutableMap.of("", 788L, "Enter Macbeth.", 7L, "Exeunt.", 21L), counts);
+      assertEquals(ImmutableMap.of("", 697L, "Enter.", 7L, "Exeunt.", 21L), counts);
       assertEquals(17, INC1);
       assertEquals(29, INC2);
     }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java b/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
index 8c4c57f..3a82a19 100644
--- a/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
@@ -54,7 +54,7 @@ public class RecordDropIT {
     }
     int index = 0;
     for (Iterable<Integer> iter : values) {
-      assertEquals("Checking index = " + index, 3667, Iterables.getFirst(iter, 0).intValue());
+      assertEquals("Checking index = " + index, 3285, Iterables.getFirst(iter, 0).intValue());
       index++;
     }
     p.done();

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
index e74c166..45f3afd 100644
--- a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
@@ -89,7 +89,7 @@ public class StageResultsCountersIT {
 
     Map<String, Long> keywordsMap = countersToMap(result.getStageResults(), KEYWORDS_COUNTER_GROUP);
 
-    assertThat(keywordsMap, is((Map<String, Long>) ImmutableMap.of("NOT", 157L, "AND", 596L, "OR", 81L)));
+    assertThat(keywordsMap, is((Map<String, Long>) ImmutableMap.of("NOT", 145L, "AND", 544L, "OR", 37L)));
   }
 
   private static PipelineResult coutSpecialKeywords(Pipeline pipeline, String inputFileName, PTypeFamily tf) {

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
index e0bd719..257c917 100644
--- a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
@@ -150,8 +150,8 @@ public class WordCountIT {
     PTable<String, Long> wordCount = wordCount(shakespeare, tf);
     List<Pair<String, Long>> top5 = Lists.newArrayList(Aggregate.top(wordCount, 5, true).materialize());
     assertEquals(
-        ImmutableList.of(Pair.of("", 1470L), Pair.of("the", 620L), Pair.of("and", 427L), Pair.of("of", 396L),
-            Pair.of("to", 367L)), top5);
+        ImmutableList.of(Pair.of("", 1345L), Pair.of("the", 528L), Pair.of("and", 375L), Pair.of("I", 314L),
+            Pair.of("of", 314L)), top5);
   }
 
   public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
@@ -191,14 +191,14 @@ public class WordCountIT {
       assertEquals(2, stageResults.size());
     } else {
       assertEquals(1, stageResults.size());
-      assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
+      assertEquals(375, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
     }
 
     File outputFile = new File(outputPath, "part-r-00000");
     List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
     boolean passed = false;
     for (String line : lines) {
-      if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) {
+      if (line.startsWith("Macbeth\t") || line.startsWith("[Macbeth,")) {
         passed = true;
         break;
       }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java b/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
index 98ae8d1..c33348a 100644
--- a/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
@@ -159,13 +159,13 @@ public class DotfilesIT {
     List<PipelineResult.StageResult> stageResults = res.getStageResults();
 
     assertEquals(1, stageResults.size());
-    assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
+    assertEquals(375, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
 
     File outputFile = new File(outputPath, "part-r-00000");
     List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
     boolean passed = false;
     for (String line : lines) {
-      if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) {
+      if (line.startsWith("Macbeth\t") || line.startsWith("[Macbeth,")) {
         passed = true;
         break;
       }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
index 7c09790..6feff1f 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
@@ -109,7 +109,7 @@ public class MapredIT extends CrunchTestSupport implements Serializable {
     PipelineResult res = p.done();
     assertEquals(1, res.getStageResults().size());
     StageResult sr = res.getStageResults().get(0);
-    assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue());
+    assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue());
   }
   
   @Test
@@ -129,6 +129,6 @@ public class MapredIT extends CrunchTestSupport implements Serializable {
     PipelineResult res = p.done();
     assertEquals(1, res.getStageResults().size());
     StageResult sr = res.getStageResults().get(0);
-    assertEquals(108, sr.getCounters().findCounter("thou", "count").getValue());
+    assertEquals(103, sr.getCounters().findCounter("thou", "count").getValue());
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
index ab453e0..9510457 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
@@ -95,7 +95,7 @@ public class MapreduceIT extends CrunchTestSupport implements Serializable {
     PipelineResult res = p.done();
     assertEquals(1, res.getStageResults().size());
     StageResult sr = res.getStageResults().get(0);
-    assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue());
+    assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue());
   }
   
   @Test

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
index 24e67b5..77edd8b 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractFullOuterJoinIT extends JoinTester {
     boolean passed2 = false;
     boolean passed3 = false;
     for (Pair<String, Long> line : lines) {
-      if ("wretched".equals(line.first()) && 24 == line.second()) {
+      if ("wretched".equals(line.first()) && 19 == line.second()) {
         passed1 = true;
       }
       if ("againe".equals(line.first()) && 10 == line.second()) {
         passed2 = true;
       }
-      if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
+      if ("moon".equals(line.first()) && 9 == line.second()) {
         passed3 = true;
       }
     }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
index 8ceaa03..a13ff27 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractInnerJoinIT extends JoinTester {
     boolean passed2 = true;
     boolean passed3 = true;
     for (Pair<String, Long> line : lines) {
-      if ("wretched".equals(line.first()) && 24 == line.second()) {
+      if ("wretched".equals(line.first()) && 19 == line.second()) {
         passed1 = true;
       }
       if ("againe".equals(line.first())) {
         passed2 = false;
       }
-      if ("Montparnasse.".equals(line.first())) {
+      if ("moon".equals(line.first())) {
         passed3 = false;
       }
     }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
index 241f5ad..43b4118 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractLeftOuterJoinIT extends JoinTester {
     boolean passed2 = false;
     boolean passed3 = true;
     for (Pair<String, Long> line : lines) {
-      if ("wretched".equals(line.first()) && 24 == line.second()) {
+      if ("wretched".equals(line.first()) && 19 == line.second()) {
         passed1 = true;
       }
       if ("againe".equals(line.first()) && 10 == line.second()) {
         passed2 = true;
       }
-      if ("Montparnasse.".equals(line.first())) {
+      if ("moon".equals(line.first())) {
         passed3 = false;
       }
     }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
index 43e0479..e5e7b4e 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractRightOuterJoinIT extends JoinTester {
     boolean passed2 = true;
     boolean passed3 = false;
     for (Pair<String, Long> line : lines) {
-      if ("wretched".equals(line.first()) && 24 == line.second()) {
+      if ("wretched".equals(line.first()) && 19 == line.second()) {
         passed1 = true;
       }
       if ("againe".equals(line.first())) {
         passed2 = false;
       }
-      if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
+      if ("moon".equals(line.first()) && 9 == line.second()) {
         passed3 = true;
       }
     }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
index 700cba5..3ada7e0 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
@@ -72,11 +72,11 @@ public abstract class JoinTester implements Serializable {
 
   protected void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
     String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
-    String maughamInputPath = tmpDir.copyResourceFileName("maugham.txt");
+    String dickensInputPath = tmpDir.copyResourceFileName("dickens.txt");
 
     PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
-    PCollection<String> maugham = pipeline.readTextFile(maughamInputPath);
-    PTable<String, Long> joined = join(shakespeare, maugham, typeFamily);
+    PCollection<String> dickens = pipeline.readTextFile(dickensInputPath);
+    PTable<String, Long> joined = join(shakespeare, dickens, typeFamily);
     Iterable<Pair<String, Long>> lines = joined.materialize();
 
     assertPassed(lines);

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
----------------------------------------------------------------------
diff --git a/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java b/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
index af24865..9027c1b 100644
--- a/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
+++ b/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
@@ -196,7 +196,7 @@ public class HFileTargetIT implements Serializable {
 
     FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
     KeyValue kv = readFromHFiles(fs, outputPath, "and");
-    assertEquals(427L, Bytes.toLong(kv.getValue()));
+    assertEquals(375L, Bytes.toLong(kv.getValue()));
   }
 
   @Test
@@ -223,11 +223,11 @@ public class HFileTargetIT implements Serializable {
         .doBulkLoad(outputPath, testTable);
 
     Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder()
-        .put("__EMPTY__", 1470L)
-        .put("the", 620L)
-        .put("and", 427L)
-        .put("of", 396L)
-        .put("to", 367L)
+        .put("__EMPTY__", 1345L)
+        .put("the", 528L)
+        .put("and", 375L)
+        .put("I", 314L)
+        .put("of", 314L)
         .build();
 
     for (Map.Entry<String, Long> e : EXPECTED.entrySet()) {
@@ -270,8 +270,8 @@ public class HFileTargetIT implements Serializable {
     loader.doBulkLoad(outputPath1, table1);
     loader.doBulkLoad(outputPath2, table2);
 
-    assertEquals(396L, getWordCountFromTable(table1, "of"));
-    assertEquals(427L, getWordCountFromTable(table2, "and"));
+    assertEquals(314L, getWordCountFromTable(table1, "of"));
+    assertEquals(375L, getWordCountFromTable(table2, "and"));
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
index 94a6e12..c79783d 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
@@ -34,7 +34,7 @@ class AggregatorsIntegrationTest extends CrunchSuite {
       .groupByKey
       .combineValues(Aggregators.product[(Long, Int)](Aggregators.sum[Long], Aggregators.max[Int]))
       .materialize
-    assert(fcc.exists(_ == ("w", (1404, 12))))
+    assert(fcc.exists(_ == ("w", (1302, 12))))
 
     pipeline.done
   }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
index c7e53ae..fb994ca 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
@@ -31,10 +31,10 @@ class CogroupTest extends CrunchSuite {
 
   @Test def cogroup {
     val shakespeare = tempDir.copyResourceFileName("shakes.txt")
-    val maugham = tempDir.copyResourceFileName("maugham.txt")
-    val diffs = wordCount(shakespeare).cogroup(wordCount(maugham))
+    val dickens = tempDir.copyResourceFileName("dickens.txt")
+    val diffs = wordCount(shakespeare).cogroup(wordCount(dickens))
         .map((k, v) => (k, (v._1.sum - v._2.sum))).materialize
-    assert(diffs.exists(_ == ("the", -11390)))
+    assert(diffs.exists(_ == ("the", -11043)))
     pipeline.done
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
index 44aa9a8..d480d22 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
@@ -46,9 +46,9 @@ class IncrementTest extends CrunchSuite {
 
     val res = pipeline.done()
     val sr0 = res.getStageResults.get(0)
-    assertEquals(21836, sr0.getCounterValue("TOP", "ALLWORDS"))
-    assertEquals(20366, sr0.getCounterValue("TOP", "NONEMPTY"))
-    assertEquals(3604, sr0.getCounterValue("TOP", "AWORDS_2x"))
-    assertEquals(20366, sr0.getCounterValue("Inc", "A"))
+    assertEquals(19082, sr0.getCounterValue("TOP", "ALLWORDS"))
+    assertEquals(17737, sr0.getCounterValue("TOP", "NONEMPTY"))
+    assertEquals(3088, sr0.getCounterValue("TOP", "AWORDS_2x"))
+    assertEquals(17737, sr0.getCounterValue("Inc", "A"))
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
index 35a6500..8947ce6 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
@@ -34,25 +34,25 @@ class JoinTest extends CrunchSuite {
 
   @Test def join {
     val shakespeare = tempDir.copyResourceFileName("shakes.txt")
-    val maugham = tempDir.copyResourceFileName("maugham.txt")
+    val dickens = tempDir.copyResourceFileName("dickens.txt")
     val output = tempDir.getFile("output")
-    val filtered = wordCount(shakespeare).join(wordCount(maugham))
+    val filtered = wordCount(shakespeare).join(wordCount(dickens))
         .map((k, v) => (k, v._1 - v._2))
         .write(to.textFile(output.getAbsolutePath()))
         .filter((k, d) => d > 0).materialize
-    assert(filtered.exists(_ == ("macbeth", 66)))
+    assert(filtered.exists(_ == ("noble", 9)))
     pipeline.done
   }
 
   @Test def joinMapside {
     val shakespeare = tempDir.copyResourceFileName("shakes.txt")
-    val maugham = tempDir.copyResourceFileName("maugham.txt")
+    val dickens = tempDir.copyResourceFileName("dickens.txt")
     val output = tempDir.getFile("output")
-    val filtered = wordCount(shakespeare).innerJoinUsing(wordCount(maugham), Joins.mapside())
+    val filtered = wordCount(shakespeare).innerJoinUsing(wordCount(dickens), Joins.mapside())
       .map((k, v) => (k, v._1 - v._2))
       .write(to.textFile(output.getAbsolutePath()))
       .filter((k, d) => d > 0).materialize
-    assert(filtered.exists(_ == ("macbeth", 66)))
+    assert(filtered.exists(_ == ("noble", 9)))
     pipeline.done
   }
 

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
index 3c232b1..b81165f 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
@@ -30,11 +30,11 @@ import org.scalatest.junit.JUnitSuite
 class PCollectionTest extends CrunchSuite {
 
   // Number of lines in the Shakespeare data set.
-  val linesInShakespeare: Int = 3667
+  val linesInShakespeare: Int = 3285
 
   // The first line in the Shakespeare data set.
   val firstLineInShakespeare: String =
-      "***The Project Gutenberg's Etext of Shakespeare's First Folio***"
+      "The Tragedie of Macbeth"
 
   // The last line in the Shakespeare data set.
   val lastLineInShakespeare: String =
@@ -79,6 +79,6 @@ class PCollectionTest extends CrunchSuite {
-    // With a seed of 1L, 380 elements should be sampled.
+    // With a seed of 1L, 338 elements should be sampled.
     val sampledCollection = shakespeare.sample(0.10, 1L)
     val length = sampledCollection.length().value()
-    assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length)
+    assertEquals("Incorrect number of elements sampled with seed 1L.", 338L, length)
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
index c566e59..c5a56fc 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
@@ -40,7 +40,7 @@ class PipelineAppTest extends CrunchSuite {
   @Test def run {
     val args = new Array[String](3)
     args(0) = tempDir.copyResourceFileName("shakes.txt")
-    args(1) = tempDir.copyResourceFileName("maugham.txt")
+    args(1) = tempDir.copyResourceFileName("dickens.txt")
     args(2) = tempDir.getFileName("output")
     tempDir.overridePathProperties(WordCount.configuration)
     WordCount.main(args)

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
index 186ec27..416251b 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
@@ -35,6 +35,6 @@ class TopTest extends CrunchSuite {
     val wc = pipeline.read(from.textFile(input))
         .flatMap(_.toLowerCase.split("\\s+"))
         .filter(!_.isEmpty()).count
-    assert(wc.top(10, true).materialize.exists(_ == ("is", 205)))
+    assert(wc.top(10, true).materialize.exists(_ == ("is", 175)))
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
index f62cef3..aebd2df 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
@@ -30,21 +30,21 @@ class UnionTest extends CrunchSuite {
 
   @Test def testUnionCollection {
     val shakespeare = tempDir.copyResourceFileName("shakes.txt")
-    val maugham = tempDir.copyResourceFileName("maugham.txt")
+    val dickens = tempDir.copyResourceFileName("dickens.txt")
     val union = pipeline.read(from.textFile(shakespeare)).union(
-        pipeline.read(from.textFile(maugham)))
+        pipeline.read(from.textFile(dickens)))
     val wc = wordCount(union).materialize
-    assert(wc.exists(_ == ("you", 3691)))
+    assert(wc.exists(_ == ("you", 2552)))
     pipeline.done
   }
 
   @Test def testUnionTable {
     val shakespeare = tempDir.copyResourceFileName("shakes.txt")
-    val maugham = tempDir.copyResourceFileName("maugham.txt")
+    val dickens = tempDir.copyResourceFileName("dickens.txt")
     val wcs = wordCount(pipeline.read(from.textFile(shakespeare)))
-    val wcm = wordCount(pipeline.read(from.textFile(maugham)))
+    val wcm = wordCount(pipeline.read(from.textFile(dickens)))
     val wc = wcs.union(wcm).groupByKey.combine(v => v.sum).materialize
-    assert(wc.exists(_ == ("you", 3691)))
+    assert(wc.exists(_ == ("you", 2552)))
     pipeline.done
   }
 }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
index 7ee4de0..bac56f9 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
@@ -33,7 +33,7 @@ class WordCountTest extends CrunchSuite {
         .write(to.textFile(wordCountOut)) // Word counts
         .map((w, c) => (w.slice(0, 1), c))
         .groupByKey.combine(v => v.sum).materialize
-    assert(fcc.exists(_ == ("w", 1404)))
+    assert(fcc.exists(_ == ("w", 1302)))
 
     pipeline.done
   }

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
----------------------------------------------------------------------
diff --git a/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java b/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
index 8126e81..815aaff 100644
--- a/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
+++ b/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
@@ -170,7 +170,7 @@ public class SparkHFileTargetIT implements Serializable {
 
     FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
     KeyValue kv = readFromHFiles(fs, outputPath, "and");
-    assertEquals(427L, Bytes.toLong(kv.getValue()));
+    assertEquals(375L, Bytes.toLong(kv.getValue()));
     pipeline.done();
   }
 
@@ -199,11 +199,11 @@ public class SparkHFileTargetIT implements Serializable {
             .doBulkLoad(outputPath, testTable);
 
     Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder()
-            .put("__EMPTY__", 1470L)
-            .put("the", 620L)
-            .put("and", 427L)
-            .put("of", 396L)
-            .put("to", 367L)
+            .put("__EMPTY__", 1345L)
+            .put("the", 528L)
+            .put("and", 375L)
+            .put("I", 314L)
+            .put("of", 314L)
             .build();
 
     for (Map.Entry<String, Long> e : EXPECTED.entrySet()) {
@@ -246,8 +246,8 @@ public class SparkHFileTargetIT implements Serializable {
     loader.doBulkLoad(outputPath1, table1);
     loader.doBulkLoad(outputPath2, table2);
 
-    assertEquals(396L, getWordCountFromTable(table1, "of"));
-    assertEquals(427L, getWordCountFromTable(table2, "and"));
+    assertEquals(314L, getWordCountFromTable(table1, "of"));
+    assertEquals(375L, getWordCountFromTable(table2, "and"));
     pipeline.done();
   }
 

http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
----------------------------------------------------------------------
diff --git a/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java b/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
index d799842..de0f893 100644
--- a/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
+++ b/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
@@ -90,7 +90,7 @@ public class SparkPipelineCallableIT extends CrunchTestSupport {
       assertFalse(p.run().succeeded());
     } else {
       Map<String, Long> counts = top3.materializeToMap();
-      assertEquals(ImmutableMap.of("", 788L, "Enter Macbeth.", 7L, "Exeunt.", 21L), counts);
+      assertEquals(ImmutableMap.of("", 697L, "Enter.", 7L, "Exeunt.", 21L), counts);
       assertEquals(17, INC1);
       assertEquals(29, INC2);
     }