You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@gobblin.apache.org by GitBox <gi...@apache.org> on 2021/06/18 21:18:48 UTC

[GitHub] [gobblin] aplex commented on a change in pull request #3296: Add initial version of automatic troubleshooter

aplex commented on a change in pull request #3296:
URL: https://github.com/apache/gobblin/pull/3296#discussion_r654101048



##########
File path: gobblin-runtime/src/main/java/org/apache/gobblin/runtime/troubleshooter/AutomaticTroubleshooter.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.runtime.troubleshooter;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.text.TextStringBuilder;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.AbstractIdleService;
+
+import javax.inject.Inject;
+import javax.inject.Singleton;
+import lombok.extern.slf4j.Slf4j;
+
+
+/**
+ * Automatic troubleshooter will identify and prioritize the problems with the job, and display a summary to the user.
+ *
+ * Troubleshooter will collect errors & warnings from logs and combine them with various health checks. After that
+ * you can {@link #refineIssues()} to prioritize them and filter out noise, and then {@link #logIssueSummary()}
+ * to show a human-readable list of issues.
+ *
+ * Implementation and architecture notes:
+ *
+ * We convert log messages and health check results to {@link Issue}s. They will be shown to the user at the end of
+ * the job log. To avoid overwhelming the user, we will only collect a fixed number of issues, and will de-duplicate
+ * them, so that each type of problem is shown only once.
+ *
+ * Issues will be emitted in GobblinTrackingEvents at the end of the job, so that they can be collected by Gobblin
+ * service, and used for future platform-wide analysis.
+ *
+ * */
+@Slf4j
+@Singleton
+public class AutomaticTroubleshooter extends AbstractIdleService {
+
+  private final IssueRepository issueRepository;
+  private AutoTroubleshooterLogAppender troubleshooterLogger;
+
+  @Inject
+  public AutomaticTroubleshooter(IssueRepository issueRepository) {
+    this.issueRepository = Objects.requireNonNull(issueRepository);
+  }
+
+  @Override
+  protected void startUp()
+      throws Exception {
+    setupLogAppender();
+  }
+
+  @Override
+  protected void shutDown()
+      throws Exception {
+    removeLogAppender();
+  }
+
+  private void setupLogAppender() {
+    org.apache.log4j.Logger rootLogger = LogManager.getRootLogger();
+
+    troubleshooterLogger = new AutoTroubleshooterLogAppender(issueRepository);
+    troubleshooterLogger.setThreshold(Level.WARN);
+    troubleshooterLogger.activateOptions();
+    rootLogger.addAppender(troubleshooterLogger);
+
+    log.info("Configured logger for automatic troubleshooting");
+  }
+
+  private void removeLogAppender() {
+    org.apache.log4j.Logger rootLogger = LogManager.getRootLogger();
+    rootLogger.removeAppender(troubleshooterLogger);
+    log.info("Removed logger for automatic troubleshooting. Processed {} events.",
+             troubleshooterLogger.getProcessedEventCount());
+  }
+
+  /**
+   * This method will sort, filter and enhance the list of issues to make it more meaningful for the user.
+   */
+  public void refineIssues()
+      throws TroubleshooterException {
+    IssueRefinery refinery = new DefaultIssueRefinery();

Review comment:
       This refinery logic will be significantly rebuilt in future PRs. What we have today is just a stub.

##########
File path: gobblin-runtime/src/main/java/org/apache/gobblin/runtime/troubleshooter/AutomaticTroubleshooter.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.runtime.troubleshooter;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.text.TextStringBuilder;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.AbstractIdleService;
+
+import javax.inject.Inject;
+import javax.inject.Singleton;
+import lombok.extern.slf4j.Slf4j;
+
+
+/**
+ * Automatic troubleshooter will identify and prioritize the problems with the job, and display a summary to the user.
+ *
+ * Troubleshooter will collect errors & warnings from logs and combine them with various health checks. After that
+ * you can {@link #refineIssues()} to prioritize them and filter out noise, and then {@link #logIssueSummary()}
+ * to show a human-readable list of issues.
+ *
+ * Implementation and architecture notes:
+ *
+ * We convert log messages and health check results to {@link Issue}s. They will be shown to the user at the end of
+ * the job log. To avoid overwhelming the user, we will only collect a fixed number of issues, and will de-duplicate
+ * them, so that each type of problem is shown only once.
+ *
+ * Issues will be emitted in GobblinTrackingEvents at the end of the job, so that they can be collected by Gobblin
+ * service, and used for future platform-wide analysis.
+ *
+ * */
+@Slf4j
+@Singleton
+public class AutomaticTroubleshooter extends AbstractIdleService {
+
+  private final IssueRepository issueRepository;
+  private AutoTroubleshooterLogAppender troubleshooterLogger;
+
+  @Inject
+  public AutomaticTroubleshooter(IssueRepository issueRepository) {
+    this.issueRepository = Objects.requireNonNull(issueRepository);
+  }
+
+  @Override
+  protected void startUp()
+      throws Exception {
+    setupLogAppender();
+  }
+
+  @Override
+  protected void shutDown()
+      throws Exception {
+    removeLogAppender();
+  }
+
+  private void setupLogAppender() {
+    org.apache.log4j.Logger rootLogger = LogManager.getRootLogger();
+
+    troubleshooterLogger = new AutoTroubleshooterLogAppender(issueRepository);
+    troubleshooterLogger.setThreshold(Level.WARN);
+    troubleshooterLogger.activateOptions();
+    rootLogger.addAppender(troubleshooterLogger);
+
+    log.info("Configured logger for automatic troubleshooting");
+  }
+
+  private void removeLogAppender() {
+    org.apache.log4j.Logger rootLogger = LogManager.getRootLogger();
+    rootLogger.removeAppender(troubleshooterLogger);
+    log.info("Removed logger for automatic troubleshooting. Processed {} events.",
+             troubleshooterLogger.getProcessedEventCount());
+  }
+
+  /**
+   * This method will sort, filter and enhance the list of issues to make it more meaningful for the user.
+   */
+  public void refineIssues()
+      throws TroubleshooterException {
+    IssueRefinery refinery = new DefaultIssueRefinery();
+
+    List<Issue> issues = issueRepository.getAll();
+    issues.sort(Comparator.comparing(Issue::getSeverity).reversed().thenComparing(Issue::getTime));

Review comment:
       Moved to the refinery.

##########
File path: gobblin-runtime/src/main/java/org/apache/gobblin/runtime/troubleshooter/AutomaticTroubleshooter.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.runtime.troubleshooter;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.text.TextStringBuilder;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.AbstractIdleService;
+
+import javax.inject.Inject;
+import javax.inject.Singleton;
+import lombok.extern.slf4j.Slf4j;
+
+
+/**
+ * Automatic troubleshooter will identify and prioritize the problems with the job, and display a summary to the user.
+ *
+ * Troubleshooter will collect errors & warnings from logs and combine them with various health checks. After that
+ * you can {@link #refineIssues()} to prioritize them and filter out noise, and then {@link #logIssueSummary()}
+ * to show a human-readable list of issues.
+ *
+ * Implementation and architecture notes:
+ *
+ * We convert log messages and health check results to {@link Issue}s. They will be shown to the user at the end of
+ * the job log. To avoid overwhelming the user, we will only collect a fixed number of issues, and will de-duplicate
+ * them, so that each type of problem is shown only once.
+ *
+ * Issues will be emitted in GobblinTrackingEvents at the end of the job, so that they can be collected by Gobblin
+ * service, and used for future platform-wide analysis.
+ *
+ * */
+@Slf4j
+@Singleton
+public class AutomaticTroubleshooter extends AbstractIdleService {
+
+  private final IssueRepository issueRepository;
+  private AutoTroubleshooterLogAppender troubleshooterLogger;
+
+  @Inject
+  public AutomaticTroubleshooter(IssueRepository issueRepository) {
+    this.issueRepository = Objects.requireNonNull(issueRepository);
+  }
+
+  @Override
+  protected void startUp()
+      throws Exception {
+    setupLogAppender();
+  }
+
+  @Override
+  protected void shutDown()
+      throws Exception {
+    removeLogAppender();
+  }
+
+  private void setupLogAppender() {
+    org.apache.log4j.Logger rootLogger = LogManager.getRootLogger();
+
+    troubleshooterLogger = new AutoTroubleshooterLogAppender(issueRepository);
+    troubleshooterLogger.setThreshold(Level.WARN);
+    troubleshooterLogger.activateOptions();
+    rootLogger.addAppender(troubleshooterLogger);

Review comment:
       Yes. Components can log various warnings that will also be summarized and displayed to the user in the Azkaban/Gobblin service. This is an opportunity for them to provide suggestions to the user for non-fatal situations.
   
   The issue repository deduplicates all issues and has a limit of 100 issues per job, so if we have runaway code that just writes thousands of warnings, it shouldn't impact the troubleshooter or the repository.

##########
File path: gobblin-runtime/src/main/java/org/apache/gobblin/runtime/troubleshooter/AutomaticTroubleshooter.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.runtime.troubleshooter;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.text.TextStringBuilder;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.AbstractIdleService;
+
+import javax.inject.Inject;
+import javax.inject.Singleton;
+import lombok.extern.slf4j.Slf4j;
+
+
+/**
+ * Automatic troubleshooter will identify and prioritize the problems with the job, and display a summary to the user.
+ *
+ * Troubleshooter will collect errors & warnings from logs and combine them with various health checks. After that
+ * you can {@link #refineIssues()} to prioritize them and filter out noise, and then {@link #logIssueSummary()}
+ * to show a human-readable list of issues.
+ *
+ * Implementation and architecture notes:
+ *
+ * We convert log messages and health check results to {@link Issue}s. They will be shown to the user at the end of
+ * the job log. To avoid overwhelming the user, we will only collect a fixed number of issues, and will de-duplicate
+ * them, so that each type of problem is shown only once.
+ *
+ * Issues will be emitted in GobblinTrackingEvents at the end of the job, so that they can be collected by Gobblin
+ * service, and used for future platform-wide analysis.
+ *
+ * */
+@Slf4j
+@Singleton
+public class AutomaticTroubleshooter extends AbstractIdleService {
+
+  private final IssueRepository issueRepository;
+  private AutoTroubleshooterLogAppender troubleshooterLogger;
+
+  @Inject
+  public AutomaticTroubleshooter(IssueRepository issueRepository) {
+    this.issueRepository = Objects.requireNonNull(issueRepository);
+  }
+
+  @Override
+  protected void startUp()
+      throws Exception {
+    setupLogAppender();
+  }
+
+  @Override
+  protected void shutDown()
+      throws Exception {
+    removeLogAppender();
+  }

Review comment:
       Yeah, most of the services in the Gobblin codebase inherit from AbstractIdleService, but in this case it's not really needed. Converted to plain start/stop methods.

##########
File path: gobblin-runtime/src/main/java/org/apache/gobblin/runtime/ThrowableWithErrorCode.java
##########
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.runtime;
+
+public interface ThrowableWithErrorCode {
+
+  /**
+   * Returns an error code that uniquely identifies the problem in the given context.
+   *
+   * The error code will be used programmatically, to take different recovery actions.
+   * */
+  String getErrorCode();

Review comment:
       Added an example.

##########
File path: gobblin-runtime/src/main/java/org/apache/gobblin/runtime/mapreduce/MRJobLauncher.java
##########
@@ -737,11 +738,19 @@ private static FileSystem buildFileSystem(Properties jobProps, Configuration con
     // A list of WorkUnits (flattened for MultiWorkUnits) to be run by this mapper
     private final List<WorkUnit> workUnits = Lists.newArrayList();
 
+    private AutomaticTroubleshooter troubleshooter;
+    private IssueRepository issueRepository;
+
     @Override
     protected void setup(Context context) {
       final State gobblinJobState = HadoopUtils.getStateFromConf(context.getConfiguration());
       TaskAttemptID taskAttemptID = context.getTaskAttemptID();
 
+      issueRepository = new InMemoryIssueRepository();
+      troubleshooter = new AutomaticTroubleshooter(issueRepository);
+
+      troubleshooter.startAsync().awaitRunning();

Review comment:
       Replaced with synchronous calls.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org