You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by ch...@apache.org on 2019/07/04 11:35:22 UTC
[flink] branch master updated: [FLINK-13019][coordination][tests]
Add ITCase for fine-grained recovery
This is an automated email from the ASF dual-hosted git repository.
chesnay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flink.git
The following commit(s) were added to refs/heads/master by this push:
new f333ea4 [FLINK-13019][coordination][tests] Add ITCase for fine-grained recovery
f333ea4 is described below
commit f333ea42aae292933fbc10b3039c40262f346164
Author: azagrebin <az...@users.noreply.github.com>
AuthorDate: Thu Jul 4 13:35:12 2019 +0200
[FLINK-13019][coordination][tests] Add ITCase for fine-grained recovery
The simplest IT test is to create a batch job with a sequence of non-parallel mappers.
Each mapper writes to a blocking partition, which means the next mapper starts when the previous one is done.
The mappers are not chained into one Task which makes them separate failover regions.
---
.../recovery/BatchFineGrainedRecoveryITCase.java | 250 +++++++++++++++++++++
1 file changed, 250 insertions(+)
diff --git a/flink-tests/src/test/java/org/apache/flink/test/recovery/BatchFineGrainedRecoveryITCase.java b/flink-tests/src/test/java/org/apache/flink/test/recovery/BatchFineGrainedRecoveryITCase.java
new file mode 100644
index 0000000..9c66293
--- /dev/null
+++ b/flink-tests/src/test/java/org/apache/flink/test/recovery/BatchFineGrainedRecoveryITCase.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.test.recovery;
+
+import org.apache.flink.api.common.ExecutionMode;
+import org.apache.flink.api.common.functions.RichMapPartitionFunction;
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.api.common.time.Time;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.JobManagerOptions;
+import org.apache.flink.runtime.minicluster.TestingMiniCluster;
+import org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration.Builder;
+import org.apache.flink.test.util.TestEnvironment;
+import org.apache.flink.util.Collector;
+import org.apache.flink.util.FlinkRuntimeException;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
+
+import static org.apache.flink.configuration.JobManagerOptions.FORCE_PARTITION_RELEASE_ON_CONSUMPTION;
+import static org.apache.flink.runtime.executiongraph.failover.FailoverStrategyLoader.PIPELINED_REGION_RESTART_STRATEGY_NAME;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * IT case for fine-grained recovery of batch jobs.
+ *
+ * <p>The test activates the region fail-over strategy to restart only failed producers.
+ * The test job is a sequence of non-parallel mappers. Each mapper writes to a blocking partition which means
+ * the next mapper starts when the previous is done. The mappers are not chained into one task which makes them
+ * separate fail-over regions.
+ *
+ * <p>The test verifies that fine-grained recovery works by randomly inducing failures in any of the mappers.
+ * Since all mappers are connected via blocking partitions, which should be re-used on failure, and the consumer
+ * of the mapper wasn't deployed yet, as the consumed partition was not fully produced yet, only the failed mapper
+ * should actually restart.
+ */
+public class BatchFineGrainedRecoveryITCase extends TestLogger {
+ // Number of records emitted by the source; also the base of the per-record failure
+ // probability, so each mapper fails roughly once per full pass over the data (see testProgram).
+ private static final int EMITTED_RECORD_NUMBER = 1000;
+ // Cap on the total number of induced failures; kept within the fixed-delay restart
+ // strategy's attempt budget configured in createExecutionEnvironment.
+ private static final int MAX_FAILURE_NUMBER = 10;
+ // Number of consecutive non-parallel mappers; each one forms a separate failover region.
+ private static final int MAP_NUMBER = 3;
+ // Each mapper adds 1 to every record, so the expected output is the input sequence shifted by MAP_NUMBER.
+ private static final List<Long> EXPECTED_JOB_OUTPUT =
+ LongStream.range(MAP_NUMBER, EMITTED_RECORD_NUMBER + MAP_NUMBER).boxed().collect(Collectors.toList());
+
+ private TestingMiniCluster miniCluster;
+
+ @Before
+ public void setup() throws Exception {
+ Configuration configuration = new Configuration();
+ // Do not release blocking partitions on consumption so they can be re-used after a restart.
+ configuration.setBoolean(FORCE_PARTITION_RELEASE_ON_CONSUMPTION, false);
+ // Activate the pipelined-region failover strategy: only the failed region (one mapper) restarts.
+ configuration.setString(JobManagerOptions.EXECUTION_FAILOVER_STRATEGY, PIPELINED_REGION_RESTART_STRATEGY_NAME);
+
+ // One slot per task manager, one task manager per mapper.
+ miniCluster = new TestingMiniCluster(
+ new Builder()
+ .setNumTaskManagers(MAP_NUMBER)
+ .setNumSlotsPerTaskManager(1)
+ .setConfiguration(configuration)
+ .build(),
+ null);
+
+ miniCluster.start();
+ }
+
+ @After
+ public void teardown() throws Exception {
+ if (miniCluster != null) {
+ miniCluster.close();
+ }
+ }
+
+ @Test
+ public void testProgram() throws Exception {
+ ExecutionEnvironment env = createExecutionEnvironment();
+
+ // Reset the static trackers: they live JVM-wide and would otherwise carry state between runs.
+ StaticFailureCounter.reset();
+ StaticMapFailureTracker.reset();
+
+ // Each record triggers a failure with probability 1/EMITTED_RECORD_NUMBER,
+ // i.e. about one induced failure per mapper pass.
+ FailureStrategy failureStrategy = new RandomExceptionFailureStrategy(1, EMITTED_RECORD_NUMBER);
+
+ DataSet<Long> input = env.generateSequence(0, EMITTED_RECORD_NUMBER - 1);
+ for (int i = 0; i < MAP_NUMBER; i++) {
+ input = input
+ .mapPartition(new TestPartitionMapper(StaticMapFailureTracker.addNewMap(), failureStrategy))
+ .name("Test partition mapper " + i);
+ }
+ assertThat(input.collect(), is(EXPECTED_JOB_OUTPUT));
+
+ // Check that each mapper restarted exactly as often as it failed (plus the initial start).
+ StaticMapFailureTracker.verify();
+ }
+
+ private ExecutionEnvironment createExecutionEnvironment() {
+ @SuppressWarnings("StaticVariableUsedBeforeInitialization")
+ ExecutionEnvironment env = new TestEnvironment(miniCluster, 1, true);
+ // Allow up to MAX_FAILURE_NUMBER restart attempts with a short delay between them.
+ env.setRestartStrategy(RestartStrategies.fixedDelayRestart(MAX_FAILURE_NUMBER, Time.milliseconds(10)));
+ env.getConfig().setExecutionMode(ExecutionMode.BATCH_FORCED); // forces all partitions to be blocking
+ return env;
+ }
+
+ // Tracks, per mapper, how often it was actually (re)started vs. how often it was expected to.
+ // Implemented as an empty enum: a non-instantiable holder for static state shared between the
+ // test thread and the task threads running in the same mini-cluster JVM.
+ private enum StaticMapFailureTracker {
+ ;
+
+ // Index i holds the counters of the i-th mapper (index handed out by addNewMap).
+ private static final List<AtomicInteger> mapRestarts = new ArrayList<>(10);
+ private static final List<AtomicInteger> expectedMapRestarts = new ArrayList<>(10);
+
+ private static void reset() {
+ mapRestarts.clear();
+ expectedMapRestarts.clear();
+ }
+
+ // Registers a new mapper and returns its tracking index. Expected restarts start at 1
+ // because open() is also invoked on the initial deployment, not only on recovery.
+ private static int addNewMap() {
+ mapRestarts.add(new AtomicInteger(0));
+ expectedMapRestarts.add(new AtomicInteger(1));
+ return mapRestarts.size() - 1;
+ }
+
+ // Called from TestPartitionMapper.open() on every (re)start of the mapper task.
+ private static void mapRestart(int index) {
+ mapRestarts.get(index).incrementAndGet();
+ }
+
+ // Called when a mapper throws an induced failure: one more restart of it becomes expected.
+ private static void mapFailure(int index) {
+ expectedMapRestarts.get(index).incrementAndGet();
+ }
+
+ // Fine-grained recovery holds iff actual restarts equal expected restarts for every mapper,
+ // i.e. a failure in one mapper never restarted any other mapper.
+ private static void verify() {
+ assertThat(collect(mapRestarts), is(collect(expectedMapRestarts)));
+ }
+
+ private static int[] collect(Collection<AtomicInteger> list) {
+ return list.stream().mapToInt(AtomicInteger::get).toArray();
+ }
+ }
+
+ // Decides per processed record whether to throw a test failure. Serializable because it is
+ // carried to the task managers as a field of the user function.
+ @FunctionalInterface
+ private interface FailureStrategy extends Serializable {
+ // Either throws an unchecked exception to simulate a task failure or returns normally.
+ void failOrNot();
+ }
+
+ private static class RandomExceptionFailureStrategy implements FailureStrategy {
+ private static final long serialVersionUID = 1L;
+
+ private final CoinToss coin;
+
+ private RandomExceptionFailureStrategy(int probFraction, int probBase) {
+ this.coin = new CoinToss(probFraction, probBase);
+ }
+
+ @Override
+ public void failOrNot() {
+ // Only fail while the global failure budget (StaticFailureCounter) is not exhausted,
+ // so the job cannot exceed the restart strategy's attempt limit and fail permanently.
+ if (coin.toss() && StaticFailureCounter.failOrNot()) {
+ throw new FlinkRuntimeException("BAGA-BOOM!!! The user function generated test failure.");
+ }
+ }
+ }
+
+ // Biased coin: toss() returns true with probability probFraction/probBase.
+ private static class CoinToss implements Serializable {
+ private static final long serialVersionUID = 1L;
+ private static final Random rnd = new Random();
+
+ private final int probFraction;
+ private final int probBase;
+
+ private CoinToss(int probFraction, int probBase) {
+ this.probFraction = probFraction;
+ this.probBase = probBase;
+ }
+
+ private boolean toss() {
+ // nextInt(probBase) is uniform in [0, probBase), so prob is uniform in [1, probBase].
+ int prob = rnd.nextInt(probBase) + 1;
+ return prob <= probFraction;
+ }
+ }
+
+ // Mapper that forwards every record incremented by 1 and may throw an induced failure
+ // before emitting; it reports its restarts and failures to StaticMapFailureTracker.
+ private static class TestPartitionMapper extends RichMapPartitionFunction<Long, Long> {
+ private static final long serialVersionUID = 1L;
+
+ private final int trackingIndex;
+ private final FailureStrategy failureStrategy;
+
+ private TestPartitionMapper(int trackingIndex, FailureStrategy failureStrategy) {
+ this.trackingIndex = trackingIndex;
+ this.failureStrategy = failureStrategy;
+ }
+
+ @Override
+ public void open(Configuration parameters) throws Exception {
+ super.open(parameters);
+ // open() runs on every deployment, so this counts the initial start plus each restart.
+ StaticMapFailureTracker.mapRestart(trackingIndex);
+ }
+
+ @Override
+ public void mapPartition(Iterable<Long> values, Collector<Long> out) {
+ values.forEach(value -> {
+ failOrNot();
+ out.collect(value + 1);
+ });
+ }
+
+ // Delegates to the failure strategy; records the failure before re-throwing so the
+ // tracker knows one more restart of exactly this mapper is expected.
+ private void failOrNot() {
+ try {
+ failureStrategy.failOrNot();
+ } catch (Throwable t) {
+ StaticMapFailureTracker.mapFailure(trackingIndex);
+ throw t;
+ }
+ }
+ }
+
+ // Global failure budget shared by all mappers; keeps the total number of induced failures
+ // below MAX_FAILURE_NUMBER so the fixed-delay restart strategy never gives up.
+ private enum StaticFailureCounter {
+ ;
+
+ private static final AtomicInteger failureNumber = new AtomicInteger(0);
+
+ // Returns true (failure allowed) while fewer than MAX_FAILURE_NUMBER failures were induced.
+ private static boolean failOrNot() {
+ return failureNumber.incrementAndGet() < MAX_FAILURE_NUMBER;
+ }
+
+ private static void reset() {
+ failureNumber.set(0);
+ }
+ }
+}