You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by to...@apache.org on 2016/09/08 13:35:28 UTC
[4/4] crunch git commit: CRUNCH-616: Replace (possibly copyrighted) Maugham text with Dickens. Contributed by Sean Owen.
CRUNCH-616: Replace (possibly copyrighted) Maugham text with Dickens. Contributed by Sean Owen.
Remove non-applicable Project Gutenberg license. Adjust lots of tests to match new text.
Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/5d237b36
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/5d237b36
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/5d237b36
Branch: refs/heads/master
Commit: 5d237b36609484d49c30fa92fdf9613b6eee9d91
Parents: f1d074c
Author: Tom White <to...@apache.org>
Authored: Thu Sep 8 14:12:30 2016 +0100
Committer: Tom White <to...@apache.org>
Committed: Thu Sep 8 14:12:30 2016 +0100
----------------------------------------------------------------------
LICENSE | 298 -
.../it/java/org/apache/crunch/CleanTextIT.java | 2 +-
.../org/apache/crunch/CollectionPObjectIT.java | 4 +-
.../org/apache/crunch/CollectionsLengthIT.java | 4 +-
.../apache/crunch/DeepCopyCustomTuplesIT.java | 2 +-
.../apache/crunch/FirstElementPObjectIT.java | 2 +-
.../it/java/org/apache/crunch/PObjectsIT.java | 2 +-
.../org/apache/crunch/PipelineCallableIT.java | 2 +-
.../it/java/org/apache/crunch/RecordDropIT.java | 2 +-
.../apache/crunch/StageResultsCountersIT.java | 2 +-
.../it/java/org/apache/crunch/WordCountIT.java | 8 +-
.../apache/crunch/impl/mr/plan/DotfilesIT.java | 4 +-
.../it/java/org/apache/crunch/lib/MapredIT.java | 4 +-
.../java/org/apache/crunch/lib/MapreduceIT.java | 2 +-
.../lib/join/AbstractFullOuterJoinIT.java | 4 +-
.../crunch/lib/join/AbstractInnerJoinIT.java | 4 +-
.../lib/join/AbstractLeftOuterJoinIT.java | 4 +-
.../lib/join/AbstractRightOuterJoinIT.java | 4 +-
.../org/apache/crunch/lib/join/JoinTester.java | 6 +-
.../apache/crunch/io/hbase/HFileTargetIT.java | 16 +-
.../scrunch/AggregatorsIntegrationTest.scala | 2 +-
.../org/apache/crunch/scrunch/CogroupTest.scala | 6 +-
.../apache/crunch/scrunch/IncrementTest.scala | 8 +-
.../org/apache/crunch/scrunch/JoinTest.scala | 12 +-
.../apache/crunch/scrunch/PCollectionTest.scala | 6 +-
.../apache/crunch/scrunch/PipelineAppTest.scala | 2 +-
.../org/apache/crunch/scrunch/TopTest.scala | 2 +-
.../org/apache/crunch/scrunch/UnionTest.scala | 12 +-
.../apache/crunch/scrunch/WordCountTest.scala | 2 +-
.../org/apache/crunch/SparkHFileTargetIT.java | 16 +-
.../apache/crunch/SparkPipelineCallableIT.java | 2 +-
crunch-test/src/main/resources/dickens.txt | 23665 ++++++++++++++
crunch-test/src/main/resources/maugham.txt | 29112 -----------------
crunch-test/src/main/resources/shakes.txt | 382 -
34 files changed, 23739 insertions(+), 29866 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/LICENSE
----------------------------------------------------------------------
diff --git a/LICENSE b/LICENSE
index 23c8577..ae4b6b6 100644
--- a/LICENSE
+++ b/LICENSE
@@ -240,304 +240,6 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
---------------------------------------------------------------------------------
-Test cases use content provided by Project Gutenberg:
-
-THE FULL PROJECT GUTENBERG LICENSE
-
-PLEASE READ THIS BEFORE YOU DISTRIBUTE OR USE THIS WORK
-
-To protect the Project Gutenberg-tm mission of promoting the free distribution
-of electronic works, by using or distributing this work (or any other work
-associated in any way with the phrase "Project Gutenberg"), you agree to comply
-with all the terms of the Full Project Gutenberg-tm License available with this
-file or online at www.gutenberg.org/license.
-
-Section 1. General Terms of Use and Redistributing Project Gutenberg-tm
-electronic works 1.A. By reading or using any part of this Project Gutenberg-tm
-electronic work, you indicate that you have read, understand, agree to and
-accept all the terms of this license and intellectual property
-(trademark/copyright) agreement. If you do not agree to abide by all the terms
-of this agreement, you must cease using and return or destroy all copies of
-Project Gutenberg-tm electronic works in your possession. If you paid a fee for
-obtaining a copy of or access to a Project Gutenberg-tm electronic work and you
-do not agree to be bound by the terms of this agreement, you may obtain a
-refund from the person or entity to whom you paid the fee as set forth in
-paragraph 1.E.8.
-
-1.B. "Project Gutenberg" is a registered trademark. It may only be used on or
-associated in any way with an electronic work by people who agree to be bound
-by the terms of this agreement. There are a few things that you can do with
-most Project Gutenberg-tm electronic works even without complying with the full
-terms of this agreement. See paragraph 1.C below. There are a lot of things you
-can do with Project Gutenberg-tm electronic works if you follow the terms of
-this agreement and help preserve free future access to Project Gutenberg-tm
-electronic works. See paragraph 1.E below.
-
-1.C. The Project Gutenberg Literary Archive Foundation ("the Foundation" or
-PGLAF), owns a compilation copyright in the collection of Project Gutenberg-tm
-electronic works. Nearly all the individual works in the collection are in the
-public domain in the United States. If an individual work is in the public
-domain in the United States and you are located in the United States, we do not
-claim a right to prevent you from copying, distributing, performing, displaying
-or creating derivative works based on the work as long as all references to
-Project Gutenberg are removed. Of course, we hope that you will support the
-Project Gutenberg-tm mission of promoting free access to electronic works by
-freely sharing Project Gutenberg-tm works in compliance with the terms of this
-agreement for keeping the Project Gutenberg-tm name associated with the work.
-You can easily comply with the terms of this agreement by keeping this work in
-the same format with its attached full Project Gutenberg-tm License when you
-share it without charge with others.
-
-[*] This particular work is one of the few copyrighted individual works
-included with the permission of the copyright holder. Information on the
-copyright owner for this particular work and the terms of use imposed by the
-copyright holder on this work are set forth at the beginning of this work.
-
-1.D. The copyright laws of the place where you are located also govern what you
-can do with this work. Copyright laws in most countries are in a constant state
-of change. If you are outside the United States, check the laws of your country
-in addition to the terms of this agreement before downloading, copying,
-displaying, performing, distributing or creating derivative works based on this
-work or any other Project Gutenberg-tm work. The Foundation makes no
-representations concerning the copyright status of any work in any country
-outside the United States.
-
-1.E. Unless you have removed all references to Project Gutenberg:
-
-1.E.1. The following sentence, with active links to, or other immediate access
-to, the full Project Gutenberg-tm License must appear prominently whenever any
-copy of a Project Gutenberg-tm work (any work on which the phrase "Project
-Gutenberg" appears, or with which the phrase "Project Gutenberg" is associated)
-is accessed, displayed, performed, viewed, copied or distributed:
-
-This eBook is for the use of anyone anywhere at no cost and with almost no
-restrictions whatsoever. You may copy it, give it away or re-use it under the
-terms of the Project Gutenberg License included with this eBook or online at
-www.gutenberg.org
-
-1.E.2. If an individual Project Gutenberg-tm electronic work is derived from
-the public domain (does not contain a notice indicating that it is posted with
-permission of the copyright holder), the work can be copied and distributed to
-anyone in the United States without paying any fees or charges. If you are
-redistributing or providing access to a work with the phrase "Project
-Gutenberg" associated with or appearing on the work, you must comply either
-with the requirements of paragraphs 1.E.1 through 1.E.7 or obtain permission
-for the use of the work and the Project Gutenberg-tm trademark as set forth in
-paragraphs 1.E.8 or 1.E.9.
-
-1.E.3. If an individual Project Gutenberg-tm electronic work is posted with the
-permission of the copyright holder, your use and distribution must comply with
-both paragraphs 1.E.1 through 1.E.7 and any additional terms imposed by the
-copyright holder. Additional terms will be linked to the Project Gutenberg-tm
-License for all works posted with the permission of the copyright holder found
-at the beginning of this work.
-
-1.E.4. Do not unlink or detach or remove the full Project Gutenberg-tm License
-terms from this work, or any files containing a part of this work or any other
-work associated with Project Gutenberg-tm.
-
-1.E.5. Do not copy, display, perform, distribute or redistribute this
-electronic work, or any part of this electronic work, without prominently
-displaying the sentence set forth in paragraph 1.E.1 with active links or
-immediate access to the full terms of the Project Gutenberg-tm License.
-
-1.E.6. You may convert to and distribute this work in any binary, compressed,
-marked up, nonproprietary or proprietary form, including any word processing or
-hypertext form. However, if you provide access to or distribute copies of a
-Project Gutenberg-tm work in a format other than "Plain Vanilla ASCII" or other
-format used in the official version posted on the official Project Gutenberg-tm
-web site (www.gutenberg.org), you must, at no additional cost, fee or expense
-to the user, provide a copy, a means of exporting a copy, or a means of
-obtaining a copy upon request, of the work in its original "Plain Vanilla
-ASCII" or other form. Any alternate format must include the full Project
-Gutenberg-tm License as specified in paragraph 1.E.1.
-
-1.E.7. Do not charge a fee for access to, viewing, displaying, performing,
-copying or distributing any Project Gutenberg-tm works unless you comply with
-paragraph 1.E.8 or 1.E.9.
-
-1.E.8. You may charge a reasonable fee for copies of or providing access to or
-distributing Project Gutenberg-tm electronic works provided that
-
-You pay a royalty fee of 20% of the gross profits you derive from the use of
-Project Gutenberg-tm works calculated using the method you already use to
-calculate your applicable taxes. The fee is owed to the owner of the Project
-Gutenberg-tm trademark, but he has agreed to donate royalties under this
-paragraph to the Project Gutenberg Literary Archive Foundation. Royalty
-payments must be paid within 60 days following each date on which you prepare
-(or are legally required to prepare) your periodic tax returns. Royalty
-payments should be clearly marked as such and sent to the Project Gutenberg
-Literary Archive Foundation at the address specified in Section 4, "Information
-about donations to the Project Gutenberg Literary Archive Foundation." You
-provide a full refund of any money paid by a user who notifies you in writing
-(or by e-mail) within 30 days of receipt that s/he does not agree to the terms
-of the full Project Gutenberg-tm License. You must require such a user to
-return or destroy all copies of the works possessed in a physical medium and
-discontinue all use of and all access to other copies of Project Gutenberg-tm
-works.
-You provide, in accordance with paragraph 1.F.3, a full refund of any money
-paid for a work or a replacement copy, if a defect in the electronic work is
-discovered and reported to you within 90 days of receipt of the work. You
-comply with all other terms of this agreement for free distribution of Project
-Gutenberg-tm works.
-
-1.E.9. If you wish to charge a fee or distribute a Project Gutenberg-tm
-electronic work or group of works on different terms than are set forth in this
-agreement, you must obtain permission in writing from both the Project
-Gutenberg Literary Archive Foundation and Michael Hart, the owner of the
-Project Gutenberg-tm trademark. Contact the Foundation as set forth in Section
-3 below.
-
-1.F.
-
-1.F.1. Project Gutenberg volunteers and employees expend considerable effort to
-identify, do copyright research on, transcribe and proofread public domain
-works in creating the Project Gutenberg-tm collection. Despite these efforts,
-Project Gutenberg-tm electronic works, and the medium on which they may be
-stored, may contain "Defects," such as, but not limited to, incomplete,
-inaccurate or corrupt data, transcription errors, a copyright or other
-intellectual property infringement, a defective or damaged disk or other
-medium, a computer virus, or computer codes that damage or cannot be read by
-your equipment.
-
-1.F.2. LIMITED WARRANTY, DISCLAIMER OF DAMAGES - Except for the "Right of
-Replacement or Refund" described in paragraph 1.F.3, the Project Gutenberg
-Literary Archive Foundation, the owner of the Project Gutenberg-tm trademark,
-and any other party distributing a Project Gutenberg-tm electronic work under
-this agreement, disclaim all liability to you for damages, costs and expenses,
-including legal fees. YOU AGREE THAT YOU HAVE NO REMEDIES FOR NEGLIGENCE,
-STRICT LIABILITY, BREACH OF WARRANTY OR BREACH OF CONTRACT EXCEPT THOSE
-PROVIDED IN PARAGRAPH 1.F.3. YOU AGREE THAT THE FOUNDATION, THE TRADEMARK
-OWNER, AND ANY DISTRIBUTOR UNDER THIS AGREEMENT WILL NOT BE LIABLE TO YOU FOR
-ACTUAL, DIRECT, INDIRECT, CONSEQUENTIAL, PUNITIVE OR INCIDENTAL DAMAGES EVEN IF
-YOU GIVE NOTICE OF THE POSSIBILITY OF SUCH DAMAGE.
-
-1.F.3. LIMITED RIGHT OF REPLACEMENT OR REFUND - If you discover a defect in
-this electronic work within 90 days of receiving it, you can receive a refund
-of the money (if any) you paid for it by sending a written explanation to the
-person you received the work from. If you received the work on a physical
-medium, you must return the medium with your written explanation. The person or
-entity that provided you with the defective work may elect to provide a
-replacement copy in lieu of a refund. If you received the work electronically,
-the person or entity providing it to you may choose to give you a second
-opportunity to receive the work electronically in lieu of a refund. If the
-second copy is also defective, you may demand a refund in writing without
-further opportunities to fix the problem.
-
-1.F.4. Except for the limited right of replacement or refund set forth in
-paragraph 1.F.3, this work is provided to you 'AS-IS', WITH NO OTHER WARRANTIES
-OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES OF
-MERCHANTABILITY OR FITNESS FOR ANY PURPOSE.
-
-1.F.5. Some states do not allow disclaimers of certain implied warranties or
-the exclusion or limitation of certain types of damages. If any disclaimer or
-limitation set forth in this agreement violates the law of the state applicable
-to this agreement, the agreement shall be interpreted to make the maximum
-disclaimer or limitation permitted by the applicable state law. The invalidity
-or unenforceability of any provision of this agreement shall not void the
-remaining provisions.
-
-1.F.6. INDEMNITY - You agree to indemnify and hold the Foundation, the
-trademark owner, any agent or employee of the Foundation, anyone providing
-copies of Project Gutenberg-tm electronic works in accordance with this
-agreement, and any volunteers associated with the production, promotion and
-distribution of Project Gutenberg-tm electronic works, harmless from all
-liability, costs and expenses, including legal fees, that arise directly or
-indirectly from any of the following which you do or cause to occur: (a)
-distribution of this or any Project Gutenberg-tm work, (b) alteration,
-modification, or additions or deletions to any Project Gutenberg-tm work, and
-(c) any Defect you cause.
-
-Section 2. Information about the Mission of Project Gutenberg-tm Project
-Gutenberg-tm is synonymous with the free distribution of electronic works in
-formats readable by the widest variety of computers including obsolete, old,
-middle-aged and new computers. It exists because of the efforts of hundreds of
-volunteers and donations from people in all walks of life.
-
-Volunteers and financial support to provide volunteers with the assistance they
-need are critical to reaching Project Gutenberg-tm's goals and ensuring that
-the Project Gutenberg-tm collection will remain freely available for
-generations to come. In 2001, the Project Gutenberg Literary Archive Foundation
-was created to provide a secure and permanent future for Project Gutenberg-tm
-and future generations. To learn more about the Project Gutenberg Literary
-Archive Foundation and how your efforts and donations can help, see Sections 3
-and 4 and the Foundation information page at www.gutenberg.org
-
-Section 3. Information about the Project Gutenberg Literary Archive Foundation
-The Project Gutenberg Literary Archive Foundation is a non profit 501(c)(3)
-educational corporation organized under the laws of the state of Mississippi
-and granted tax exempt status by the Internal Revenue Service. The Foundation's
-EIN or federal tax identification number is 64-6221541. Contributions to the
-Project Gutenberg Literary Archive Foundation are tax deductible to the full
-extent permitted by U.S. federal laws and your state's laws.
-
-The Foundation's principal office is located at 4557 Melan Dr. S. Fairbanks,
-AK, 99712., but its volunteers and employees are scattered throughout numerous
-locations. Its business office is located at 809 North 1500 West, Salt Lake
-City, UT 84116, (801) 596-1887. Email contact links and up to date contact
-information can be found at the Foundation's web site and official page at
-www.gutenberg.org/contact
-
-For additional contact information:
-
- Dr. Gregory B. Newby
- Chief Executive and Director
- gbnewby@pglaf.org
-
-Section 4. Information about Donations to the Project Gutenberg Literary
-Archive Foundation Project Gutenberg-tm depends upon and cannot survive without
-wide spread public support and donations to carry out its mission of increasing
-the number of public domain and licensed works that can be freely distributed
-in machine readable form accessible by the widest array of equipment including
-outdated equipment. Many small donations ($1 to $5,000) are particularly
-important to maintaining tax exempt status with the IRS.
-
-The Foundation is committed to complying with the laws regulating charities and
-charitable donations in all 50 states of the United States. Compliance
-requirements are not uniform and it takes a considerable effort, much paperwork
-and many fees to meet and keep up with these requirements. We do not solicit
-donations in locations where we have not received written confirmation of
-compliance. To SEND DONATIONS or determine the status of compliance for any
-particular state visit www.gutenberg.org/donate
-
-While we cannot and do not solicit contributions from states where we have not
-met the solicitation requirements, we know of no prohibition against accepting
-unsolicited donations from donors in such states who approach us with offers to
-donate.
-
-International donations are gratefully accepted, but we cannot make any
-statements concerning tax treatment of donations received from outside the
-United States. U.S. laws alone swamp our small staff.
-
-Please check the Project Gutenberg Web pages for current donation methods and
-addresses. Donations are accepted in a number of other ways including checks,
-online payments and credit card donations. To donate, please visit:
-www.gutenberg.org/donate
-
-Section 5. General Information About Project Gutenberg-tm electronic works.
-Professor Michael S. Hart was the originator of the Project Gutenberg-tm
-concept of a library of electronic works that could be freely shared with
-anyone. For forty years, he produced and distributed Project Gutenberg-tm
-eBooks with only a loose network of volunteer support.
-
-Project Gutenberg-tm eBooks are often created from several printed editions,
-all of which are confirmed as Public Domain in the U.S. unless a copyright
-notice is included. Thus, we do not necessarily keep eBooks in compliance with
-any particular paper edition.
-
-Most people start at our Web site which has the main PG search facility:
-www.gutenberg.org
-
-This Web site includes information about Project Gutenberg-tm, including how to
-make donations to the Project Gutenberg Literary Archive Foundation, how to
-help produce our new eBooks, and how to subscribe to our email newsletter to
-hear about new eBooks.
-
-[*] This paragraph, after 1.C., is included only for copyrighted works. For
-those, you must contact the copyright holder before any non-free use or removal
-of the Project Gutenberg header.
-
================================================================================
The binary distribution for Apache Crunch includes the following
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
index 9d6f682..563af07 100644
--- a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
@@ -41,7 +41,7 @@ import com.google.common.io.Files;
*/
public class CleanTextIT {
- private static final int LINES_IN_SHAKES = 3667;
+ private static final int LINES_IN_SHAKES = 3285;
@Rule
public TemporaryPath tmpDir = TemporaryPaths.create();
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
index 7e0c75c..08e5ac2 100644
--- a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
@@ -37,10 +37,10 @@ import org.junit.Test;
@SuppressWarnings("serial")
public class CollectionPObjectIT {
- private static final int LINES_IN_SHAKES = 3667;
+ private static final int LINES_IN_SHAKES = 3285;
private static final String FIRST_SHAKESPEARE_LINE =
- "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
+ "The Tragedie of Macbeth";
private static final String LAST_SHAKESPEARE_LINE =
"FINIS. THE TRAGEDIE OF MACBETH.";
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
index f1a33a2..f676bab 100644
--- a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
@@ -34,7 +34,7 @@ import org.junit.Test;
@SuppressWarnings("serial")
public class CollectionsLengthIT {
- public static final Long LINES_IN_SHAKESPEARE = 3667L;
+ public static final Long LINES_IN_SHAKESPEARE = 3285L;
@Rule
public TemporaryPath tmpDir = TemporaryPaths.create();
@@ -64,6 +64,6 @@ public class CollectionsLengthIT {
PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
Long length = shakespeare.length().getValue();
- assertEquals("Incorrect length for shakespear PCollection.", LINES_IN_SHAKESPEARE, length);
+ assertEquals("Incorrect length for Shakespeare PCollection.", LINES_IN_SHAKESPEARE, length);
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
index f1323ca..54f9917 100644
--- a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
@@ -54,7 +54,7 @@ public class DeepCopyCustomTuplesIT {
.groupByKey()
.parallelDo(new PostProcFn(), strings())
.materialize();
- assertEquals(65, Iterables.size(out));
+ assertEquals(59, Iterables.size(out));
p.done();
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
index d985e10..a016c12 100644
--- a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
@@ -36,7 +36,7 @@ import org.junit.Test;
public class FirstElementPObjectIT {
private static final String FIRST_SHAKESPEARE_LINE =
- "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
+ "The Tragedie of Macbeth";
@Rule
public TemporaryPath tmpDir = TemporaryPaths.create();
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
index 6ee849f..42c046a 100644
--- a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
@@ -37,7 +37,7 @@ import org.junit.Test;
@SuppressWarnings("serial")
public class PObjectsIT {
- private static final Integer LINES_IN_SHAKES = 3667;
+ private static final Integer LINES_IN_SHAKES = 3285;
@Rule
public TemporaryPath tmpDir = TemporaryPaths.create();
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java b/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
index 95638a1..ff5dc60 100644
--- a/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java
@@ -95,7 +95,7 @@ public class PipelineCallableIT {
assertFalse(p.run().succeeded());
} else {
Map<String, Long> counts = top3.materializeToMap();
- assertEquals(ImmutableMap.of("", 788L, "Enter Macbeth.", 7L, "Exeunt.", 21L), counts);
+ assertEquals(ImmutableMap.of("", 697L, "Enter.", 7L, "Exeunt.", 21L), counts);
assertEquals(17, INC1);
assertEquals(29, INC2);
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java b/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
index 8c4c57f..3a82a19 100644
--- a/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java
@@ -54,7 +54,7 @@ public class RecordDropIT {
}
int index = 0;
for (Iterable<Integer> iter : values) {
- assertEquals("Checking index = " + index, 3667, Iterables.getFirst(iter, 0).intValue());
+ assertEquals("Checking index = " + index, 3285, Iterables.getFirst(iter, 0).intValue());
index++;
}
p.done();
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
index e74c166..45f3afd 100644
--- a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
@@ -89,7 +89,7 @@ public class StageResultsCountersIT {
Map<String, Long> keywordsMap = countersToMap(result.getStageResults(), KEYWORDS_COUNTER_GROUP);
- assertThat(keywordsMap, is((Map<String, Long>) ImmutableMap.of("NOT", 157L, "AND", 596L, "OR", 81L)));
+ assertThat(keywordsMap, is((Map<String, Long>) ImmutableMap.of("NOT", 145L, "AND", 544L, "OR", 37L)));
}
private static PipelineResult coutSpecialKeywords(Pipeline pipeline, String inputFileName, PTypeFamily tf) {
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
index e0bd719..257c917 100644
--- a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
@@ -150,8 +150,8 @@ public class WordCountIT {
PTable<String, Long> wordCount = wordCount(shakespeare, tf);
List<Pair<String, Long>> top5 = Lists.newArrayList(Aggregate.top(wordCount, 5, true).materialize());
assertEquals(
- ImmutableList.of(Pair.of("", 1470L), Pair.of("the", 620L), Pair.of("and", 427L), Pair.of("of", 396L),
- Pair.of("to", 367L)), top5);
+ ImmutableList.of(Pair.of("", 1345L), Pair.of("the", 528L), Pair.of("and", 375L), Pair.of("I", 314L),
+ Pair.of("of", 314L)), top5);
}
public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
@@ -191,14 +191,14 @@ public class WordCountIT {
assertEquals(2, stageResults.size());
} else {
assertEquals(1, stageResults.size());
- assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
+ assertEquals(375, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
}
File outputFile = new File(outputPath, "part-r-00000");
List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
boolean passed = false;
for (String line : lines) {
- if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) {
+ if (line.startsWith("Macbeth\t") || line.startsWith("[Macbeth,")) {
passed = true;
break;
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java b/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
index 98ae8d1..c33348a 100644
--- a/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java
@@ -159,13 +159,13 @@ public class DotfilesIT {
List<PipelineResult.StageResult> stageResults = res.getStageResults();
assertEquals(1, stageResults.size());
- assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
+ assertEquals(375, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
File outputFile = new File(outputPath, "part-r-00000");
List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
boolean passed = false;
for (String line : lines) {
- if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) {
+ if (line.startsWith("Macbeth\t") || line.startsWith("[Macbeth,")) {
passed = true;
break;
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
index 7c09790..6feff1f 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java
@@ -109,7 +109,7 @@ public class MapredIT extends CrunchTestSupport implements Serializable {
PipelineResult res = p.done();
assertEquals(1, res.getStageResults().size());
StageResult sr = res.getStageResults().get(0);
- assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue());
+ assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue());
}
@Test
@@ -129,6 +129,6 @@ public class MapredIT extends CrunchTestSupport implements Serializable {
PipelineResult res = p.done();
assertEquals(1, res.getStageResults().size());
StageResult sr = res.getStageResults().get(0);
- assertEquals(108, sr.getCounters().findCounter("thou", "count").getValue());
+ assertEquals(103, sr.getCounters().findCounter("thou", "count").getValue());
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
index ab453e0..9510457 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java
@@ -95,7 +95,7 @@ public class MapreduceIT extends CrunchTestSupport implements Serializable {
PipelineResult res = p.done();
assertEquals(1, res.getStageResults().size());
StageResult sr = res.getStageResults().get(0);
- assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue());
+ assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue());
}
@Test
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
index 24e67b5..77edd8b 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractFullOuterJoinIT extends JoinTester {
boolean passed2 = false;
boolean passed3 = false;
for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
+ if ("wretched".equals(line.first()) && 19 == line.second()) {
passed1 = true;
}
if ("againe".equals(line.first()) && 10 == line.second()) {
passed2 = true;
}
- if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
+ if ("moon".equals(line.first()) && 9 == line.second()) {
passed3 = true;
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
index 8ceaa03..a13ff27 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractInnerJoinIT extends JoinTester {
boolean passed2 = true;
boolean passed3 = true;
for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
+ if ("wretched".equals(line.first()) && 19 == line.second()) {
passed1 = true;
}
if ("againe".equals(line.first())) {
passed2 = false;
}
- if ("Montparnasse.".equals(line.first())) {
+ if ("moon".equals(line.first())) {
passed3 = false;
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
index 241f5ad..43b4118 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractLeftOuterJoinIT extends JoinTester {
boolean passed2 = false;
boolean passed3 = true;
for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
+ if ("wretched".equals(line.first()) && 19 == line.second()) {
passed1 = true;
}
if ("againe".equals(line.first()) && 10 == line.second()) {
passed2 = true;
}
- if ("Montparnasse.".equals(line.first())) {
+ if ("moon".equals(line.first())) {
passed3 = false;
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
index 43e0479..e5e7b4e 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java
@@ -28,13 +28,13 @@ public abstract class AbstractRightOuterJoinIT extends JoinTester {
boolean passed2 = true;
boolean passed3 = false;
for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
+ if ("wretched".equals(line.first()) && 19 == line.second()) {
passed1 = true;
}
if ("againe".equals(line.first())) {
passed2 = false;
}
- if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
+ if ("moon".equals(line.first()) && 9 == line.second()) {
passed3 = true;
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
index 700cba5..3ada7e0 100644
--- a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
@@ -72,11 +72,11 @@ public abstract class JoinTester implements Serializable {
protected void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- String maughamInputPath = tmpDir.copyResourceFileName("maugham.txt");
+ String dickensInputPath = tmpDir.copyResourceFileName("dickens.txt");
PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- PCollection<String> maugham = pipeline.readTextFile(maughamInputPath);
- PTable<String, Long> joined = join(shakespeare, maugham, typeFamily);
+ PCollection<String> dickens = pipeline.readTextFile(dickensInputPath);
+ PTable<String, Long> joined = join(shakespeare, dickens, typeFamily);
Iterable<Pair<String, Long>> lines = joined.materialize();
assertPassed(lines);
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
----------------------------------------------------------------------
diff --git a/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java b/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
index af24865..9027c1b 100644
--- a/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
+++ b/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java
@@ -196,7 +196,7 @@ public class HFileTargetIT implements Serializable {
FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
KeyValue kv = readFromHFiles(fs, outputPath, "and");
- assertEquals(427L, Bytes.toLong(kv.getValue()));
+ assertEquals(375L, Bytes.toLong(kv.getValue()));
}
@Test
@@ -223,11 +223,11 @@ public class HFileTargetIT implements Serializable {
.doBulkLoad(outputPath, testTable);
Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder()
- .put("__EMPTY__", 1470L)
- .put("the", 620L)
- .put("and", 427L)
- .put("of", 396L)
- .put("to", 367L)
+ .put("__EMPTY__", 1345L)
+ .put("the", 528L)
+ .put("and", 375L)
+ .put("I", 314L)
+ .put("of", 314L)
.build();
for (Map.Entry<String, Long> e : EXPECTED.entrySet()) {
@@ -270,8 +270,8 @@ public class HFileTargetIT implements Serializable {
loader.doBulkLoad(outputPath1, table1);
loader.doBulkLoad(outputPath2, table2);
- assertEquals(396L, getWordCountFromTable(table1, "of"));
- assertEquals(427L, getWordCountFromTable(table2, "and"));
+ assertEquals(314L, getWordCountFromTable(table1, "of"));
+ assertEquals(375L, getWordCountFromTable(table2, "and"));
}
@Test
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
index 94a6e12..c79783d 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala
@@ -34,7 +34,7 @@ class AggregatorsIntegrationTest extends CrunchSuite {
.groupByKey
.combineValues(Aggregators.product[(Long, Int)](Aggregators.sum[Long], Aggregators.max[Int]))
.materialize
- assert(fcc.exists(_ == ("w", (1404, 12))))
+ assert(fcc.exists(_ == ("w", (1302, 12))))
pipeline.done
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
index c7e53ae..fb994ca 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala
@@ -31,10 +31,10 @@ class CogroupTest extends CrunchSuite {
@Test def cogroup {
val shakespeare = tempDir.copyResourceFileName("shakes.txt")
- val maugham = tempDir.copyResourceFileName("maugham.txt")
- val diffs = wordCount(shakespeare).cogroup(wordCount(maugham))
+ val dickens = tempDir.copyResourceFileName("dickens.txt")
+ val diffs = wordCount(shakespeare).cogroup(wordCount(dickens))
.map((k, v) => (k, (v._1.sum - v._2.sum))).materialize
- assert(diffs.exists(_ == ("the", -11390)))
+ assert(diffs.exists(_ == ("the", -11043)))
pipeline.done
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
index 44aa9a8..d480d22 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala
@@ -46,9 +46,9 @@ class IncrementTest extends CrunchSuite {
val res = pipeline.done()
val sr0 = res.getStageResults.get(0)
- assertEquals(21836, sr0.getCounterValue("TOP", "ALLWORDS"))
- assertEquals(20366, sr0.getCounterValue("TOP", "NONEMPTY"))
- assertEquals(3604, sr0.getCounterValue("TOP", "AWORDS_2x"))
- assertEquals(20366, sr0.getCounterValue("Inc", "A"))
+ assertEquals(19082, sr0.getCounterValue("TOP", "ALLWORDS"))
+ assertEquals(17737, sr0.getCounterValue("TOP", "NONEMPTY"))
+ assertEquals(3088, sr0.getCounterValue("TOP", "AWORDS_2x"))
+ assertEquals(17737, sr0.getCounterValue("Inc", "A"))
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
index 35a6500..8947ce6 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala
@@ -34,25 +34,25 @@ class JoinTest extends CrunchSuite {
@Test def join {
val shakespeare = tempDir.copyResourceFileName("shakes.txt")
- val maugham = tempDir.copyResourceFileName("maugham.txt")
+ val dickens = tempDir.copyResourceFileName("dickens.txt")
val output = tempDir.getFile("output")
- val filtered = wordCount(shakespeare).join(wordCount(maugham))
+ val filtered = wordCount(shakespeare).join(wordCount(dickens))
.map((k, v) => (k, v._1 - v._2))
.write(to.textFile(output.getAbsolutePath()))
.filter((k, d) => d > 0).materialize
- assert(filtered.exists(_ == ("macbeth", 66)))
+ assert(filtered.exists(_ == ("noble", 9)))
pipeline.done
}
@Test def joinMapside {
val shakespeare = tempDir.copyResourceFileName("shakes.txt")
- val maugham = tempDir.copyResourceFileName("maugham.txt")
+ val dickens = tempDir.copyResourceFileName("dickens.txt")
val output = tempDir.getFile("output")
- val filtered = wordCount(shakespeare).innerJoinUsing(wordCount(maugham), Joins.mapside())
+ val filtered = wordCount(shakespeare).innerJoinUsing(wordCount(dickens), Joins.mapside())
.map((k, v) => (k, v._1 - v._2))
.write(to.textFile(output.getAbsolutePath()))
.filter((k, d) => d > 0).materialize
- assert(filtered.exists(_ == ("macbeth", 66)))
+ assert(filtered.exists(_ == ("noble", 9)))
pipeline.done
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
index 3c232b1..b81165f 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
@@ -30,11 +30,11 @@ import org.scalatest.junit.JUnitSuite
class PCollectionTest extends CrunchSuite {
// Number of lines in the Shakespeare data set.
- val linesInShakespeare: Int = 3667
+ val linesInShakespeare: Int = 3285
// The first line in the Shakespeare data set.
val firstLineInShakespeare: String =
- "***The Project Gutenberg's Etext of Shakespeare's First Folio***"
+ "The Tragedie of Macbeth"
// The last line in the Shakespeare data set.
val lastLineInShakespeare: String =
@@ -79,6 +79,6 @@ class PCollectionTest extends CrunchSuite {
// With a seed of 1L, 380 elements should be sampled.
val sampledCollection = shakespeare.sample(0.10, 1L)
val length = sampledCollection.length().value()
- assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length)
+ assertEquals("Incorrect number of elements sampled with seed 1L.", 338L, length)
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
index c566e59..c5a56fc 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala
@@ -40,7 +40,7 @@ class PipelineAppTest extends CrunchSuite {
@Test def run {
val args = new Array[String](3)
args(0) = tempDir.copyResourceFileName("shakes.txt")
- args(1) = tempDir.copyResourceFileName("maugham.txt")
+ args(1) = tempDir.copyResourceFileName("dickens.txt")
args(2) = tempDir.getFileName("output")
tempDir.overridePathProperties(WordCount.configuration)
WordCount.main(args)
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
index 186ec27..416251b 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala
@@ -35,6 +35,6 @@ class TopTest extends CrunchSuite {
val wc = pipeline.read(from.textFile(input))
.flatMap(_.toLowerCase.split("\\s+"))
.filter(!_.isEmpty()).count
- assert(wc.top(10, true).materialize.exists(_ == ("is", 205)))
+ assert(wc.top(10, true).materialize.exists(_ == ("is", 175)))
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
index f62cef3..aebd2df 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala
@@ -30,21 +30,21 @@ class UnionTest extends CrunchSuite {
@Test def testUnionCollection {
val shakespeare = tempDir.copyResourceFileName("shakes.txt")
- val maugham = tempDir.copyResourceFileName("maugham.txt")
+ val dickens = tempDir.copyResourceFileName("dickens.txt")
val union = pipeline.read(from.textFile(shakespeare)).union(
- pipeline.read(from.textFile(maugham)))
+ pipeline.read(from.textFile(dickens)))
val wc = wordCount(union).materialize
- assert(wc.exists(_ == ("you", 3691)))
+ assert(wc.exists(_ == ("you", 2552)))
pipeline.done
}
@Test def testUnionTable {
val shakespeare = tempDir.copyResourceFileName("shakes.txt")
- val maugham = tempDir.copyResourceFileName("maugham.txt")
+ val dickens = tempDir.copyResourceFileName("dickens.txt")
val wcs = wordCount(pipeline.read(from.textFile(shakespeare)))
- val wcm = wordCount(pipeline.read(from.textFile(maugham)))
+ val wcm = wordCount(pipeline.read(from.textFile(dickens)))
val wc = wcs.union(wcm).groupByKey.combine(v => v.sum).materialize
- assert(wc.exists(_ == ("you", 3691)))
+ assert(wc.exists(_ == ("you", 2552)))
pipeline.done
}
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
index 7ee4de0..bac56f9 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala
@@ -33,7 +33,7 @@ class WordCountTest extends CrunchSuite {
.write(to.textFile(wordCountOut)) // Word counts
.map((w, c) => (w.slice(0, 1), c))
.groupByKey.combine(v => v.sum).materialize
- assert(fcc.exists(_ == ("w", 1404)))
+ assert(fcc.exists(_ == ("w", 1302)))
pipeline.done
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
----------------------------------------------------------------------
diff --git a/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java b/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
index 8126e81..815aaff 100644
--- a/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
+++ b/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java
@@ -170,7 +170,7 @@ public class SparkHFileTargetIT implements Serializable {
FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
KeyValue kv = readFromHFiles(fs, outputPath, "and");
- assertEquals(427L, Bytes.toLong(kv.getValue()));
+ assertEquals(375L, Bytes.toLong(kv.getValue()));
pipeline.done();
}
@@ -199,11 +199,11 @@ public class SparkHFileTargetIT implements Serializable {
.doBulkLoad(outputPath, testTable);
Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder()
- .put("__EMPTY__", 1470L)
- .put("the", 620L)
- .put("and", 427L)
- .put("of", 396L)
- .put("to", 367L)
+ .put("__EMPTY__", 1345L)
+ .put("the", 528L)
+ .put("and", 375L)
+ .put("I", 314L)
+ .put("of", 314L)
.build();
for (Map.Entry<String, Long> e : EXPECTED.entrySet()) {
@@ -246,8 +246,8 @@ public class SparkHFileTargetIT implements Serializable {
loader.doBulkLoad(outputPath1, table1);
loader.doBulkLoad(outputPath2, table2);
- assertEquals(396L, getWordCountFromTable(table1, "of"));
- assertEquals(427L, getWordCountFromTable(table2, "and"));
+ assertEquals(314L, getWordCountFromTable(table1, "of"));
+ assertEquals(375L, getWordCountFromTable(table2, "and"));
pipeline.done();
}
http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
----------------------------------------------------------------------
diff --git a/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java b/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
index d799842..de0f893 100644
--- a/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
+++ b/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java
@@ -90,7 +90,7 @@ public class SparkPipelineCallableIT extends CrunchTestSupport {
assertFalse(p.run().succeeded());
} else {
Map<String, Long> counts = top3.materializeToMap();
- assertEquals(ImmutableMap.of("", 788L, "Enter Macbeth.", 7L, "Exeunt.", 21L), counts);
+ assertEquals(ImmutableMap.of("", 697L, "Enter.", 7L, "Exeunt.", 21L), counts);
assertEquals(17, INC1);
assertEquals(29, INC2);
}