Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Log warning when Major GC fails to recover significant memory
Browse files Browse the repository at this point in the history
There are instances where processes spin in GC loops when Major GCs fail
to recover enough memory. By providing a warning before this condition
is reached, it should be possible to act on the warning by taking a JFR
before the JVM completely runs out of memory.
wweiss-starburst committed Aug 20, 2024
1 parent 7c6603a commit 8eec730
Showing 3 changed files with 118 additions and 20 deletions.
122 changes: 104 additions & 18 deletions stats/src/main/java/io/airlift/stats/JmxGcMonitor.java
Original file line number Diff line number Diff line change
@@ -13,7 +13,9 @@
*/
package io.airlift.stats;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import io.airlift.log.Logger;
import io.airlift.units.DataSize;
@@ -35,11 +37,14 @@
import java.lang.management.MemoryUsage;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.sun.management.GarbageCollectionNotificationInfo.GARBAGE_COLLECTION_NOTIFICATION;
import static io.airlift.units.DataSize.succinctBytes;
import static java.lang.Math.max;
import static java.lang.management.ManagementFactory.getPlatformMBeanServer;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.MILLISECONDS;

@@ -55,13 +60,21 @@
public class JmxGcMonitor
implements GcMonitor
{
@VisibleForTesting
static final double FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN = 0.8;
private static final double MINIMUM_PERCENTAGE_OF_HEAP_RECLAIMED = 10.0;
private static final String MAJOR_ZGC_NAME = "ZGC Major Cycles";
private static final Set<String> ZGC_MBEANS = ImmutableSet.of("ZGC Cycles", MAJOR_ZGC_NAME, "ZGC Minor Cycles");

private final Logger log = Logger.get(JmxGcMonitor.class);

private final NotificationListener notificationListener = (notification, ignored) -> onNotification(notification);
private final NotificationListener zgcNotificationListener = (notification, ignored) -> onZgcNotification(notification);

private final AtomicLong majorGcCount = new AtomicLong();
private final AtomicLong majorGcTime = new AtomicLong();
private final TimeStat majorGc = new TimeStat();
private boolean zgcSupportsMajor;

private final TimeStat minorGc = new TimeStat();

@@ -74,11 +87,22 @@ public void start()
for (GarbageCollectorMXBean mbean : ManagementFactory.getGarbageCollectorMXBeans()) {
ObjectName objectName = mbean.getObjectName();
try {
ManagementFactory.getPlatformMBeanServer().addNotificationListener(
objectName,
notificationListener,
null,
null);
// Don't register listeners for ZGC Pauses because it does not report memory usage statistics
if (ZGC_MBEANS.contains(mbean.getName())) {
zgcSupportsMajor |= mbean.getName().startsWith("ZGC Major");
getPlatformMBeanServer().addNotificationListener(
objectName,
zgcNotificationListener,
null,
null);
}
else {
getPlatformMBeanServer().addNotificationListener(
objectName,
notificationListener,
null,
null);
}
}
catch (JMException e) {
throw new RuntimeException("Unable to add GC listener", e);
@@ -92,7 +116,14 @@ public void stop()
for (GarbageCollectorMXBean mbean : ManagementFactory.getGarbageCollectorMXBeans()) {
ObjectName objectName = mbean.getObjectName();
try {
ManagementFactory.getPlatformMBeanServer().removeNotificationListener(objectName, notificationListener);
if (mbean.getName().startsWith("ZGC")) {
if (!mbean.getName().equals("ZGC Pauses")) {
getPlatformMBeanServer().removeNotificationListener(objectName, zgcNotificationListener);
}
}
else {
getPlatformMBeanServer().removeNotificationListener(objectName, notificationListener);
}
}
catch (JMException ignored) {
log.warn("Failed to unregister GC listener");
@@ -128,26 +159,34 @@ public TimeStat getMinorGc()

private synchronized void onNotification(Notification notification)
{
if ("com.sun.management.gc.notification".equals(notification.getType())) {
if (GARBAGE_COLLECTION_NOTIFICATION.equals(notification.getType())) {
GarbageCollectionNotificationInfo info = new GarbageCollectionNotificationInfo((CompositeData) notification.getUserData());

if (info.isMajorGc()) {
majorGcCount.incrementAndGet();
majorGcTime.addAndGet(info.getDurationMs());
majorGc.add(info.getDurationMs(), MILLISECONDS);

// assume that major GCs stop the application
long applicationRuntime = max(0, info.getStartTime() - lastGcEndTime);
lastGcEndTime = info.getEndTime();
logMajorGc(info);
}
else if (info.isMinorGc()) {
minorGc.add(info.getDurationMs(), MILLISECONDS);

log.info(
"Major GC: application %sms, stopped %sms: %s -> %s",
applicationRuntime,
// assumption that minor GCs run currently, so we do not print stopped or application time
log.debug(
"Minor GC: duration %sms: %s -> %s",
info.getDurationMs(),
info.getBeforeGcTotal(),
info.getAfterGcTotal());
}
else if (info.isMinorGc()) {
}
}

private synchronized void onZgcNotification(Notification notification)
{
if (GARBAGE_COLLECTION_NOTIFICATION.equals(notification.getType())) {
GarbageCollectionNotificationInfo info = new GarbageCollectionNotificationInfo((CompositeData) notification.getUserData());
// NOTE: When ZGC is used without -XX:+ZGenerational, the log becomes noisy
if (!zgcSupportsMajor || info.isMajorZgc()) {
logMajorGc(info);
}
else {
minorGc.add(info.getDurationMs(), MILLISECONDS);

// assumption that minor GCs run currently, so we do not print stopped or application time
@@ -160,6 +199,48 @@ else if (info.isMinorGc()) {
}
}

private void logMajorGc(GarbageCollectionNotificationInfo info)
{
majorGcCount.incrementAndGet();
majorGcTime.addAndGet(info.getDurationMs());
majorGc.add(info.getDurationMs(), MILLISECONDS);

// assume that major GCs stop the application
long applicationRuntime = max(0, info.getStartTime() - lastGcEndTime);
lastGcEndTime = info.getEndTime();

double percentHeapReclaimed = percentOfMaxHeapReclaimed(
ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax(),
info.getBeforeGcTotal().toBytes(),
info.getAfterGcTotal().toBytes());
if (percentHeapReclaimed < MINIMUM_PERCENTAGE_OF_HEAP_RECLAIMED) {
log.warn(
"Major GC: application %sms, stopped %sms: %s -> %s (only recovered %.1f%% of heap memory)",
applicationRuntime,
info.getDurationMs(),
info.getBeforeGcTotal(),
info.getAfterGcTotal(),
percentHeapReclaimed);
return;
}

log.info(
"Major GC: application %sms, stopped %sms: %s -> %s",
applicationRuntime,
info.getDurationMs(),
info.getBeforeGcTotal(),
info.getAfterGcTotal());
}

static double percentOfMaxHeapReclaimed(long maxHeapMemoryUsage, double totalBeforeGcMemory, double totalAfterGcMemory)
{
if (maxHeapMemoryUsage <= 0 || (totalBeforeGcMemory / maxHeapMemoryUsage) < FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN) {
// There is no max heap defined or the heap is not close to the max yet
return 100.0;
}
return (1 - (totalAfterGcMemory / totalBeforeGcMemory)) * 100;
}

private static class GarbageCollectionNotificationInfo
{
// these are well known constants used by all known OpenJDK GCs
@@ -234,6 +315,11 @@ public boolean isMajorGc()
return gcAction.equalsIgnoreCase(MAJOR_GC_NAME);
}

public boolean isMajorZgc()
{
return gcName.equalsIgnoreCase(MAJOR_ZGC_NAME);
}

@Override
public String toString()
{
4 changes: 2 additions & 2 deletions stats/src/test/java/io/airlift/stats/GcMonitorTester.java
Original file line number Diff line number Diff line change
@@ -24,14 +24,14 @@
* <p>
* Test should always be run with:
* <pre>
* {@code -Xmx1g -Xms1g -XX:+PrintGCApplicationStoppedTime}
* {@code -Xmx1g -Xms1g}
* </pre>
* All GC algorithms should be tested:
* <ul>
* <li>Serial: {@code -XX:+UseSerialGC }</li>
* <li>Parallel: {@code -XX:+UseParallelGC}</li>
* <li>CMS: {@code -XX:+UseConcMarkSweepGC}</li>
* <li>G1: {@code -XX:+UseG1GC}</li>
* <li>Z: {@code -XX:+UseZGC -XX:+ZGenerational}</li>
* </ul>
* Verifying stopped time in GC log to the stopped time from GCMonitor.
*/
12 changes: 12 additions & 0 deletions stats/src/test/java/io/airlift/stats/TestJmxGcMonitor.java
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
import io.airlift.units.Duration;
import org.junit.jupiter.api.Test;

import static io.airlift.stats.JmxGcMonitor.FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN;
import static io.airlift.testing.Assertions.assertGreaterThanOrEqual;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static org.assertj.core.api.Assertions.assertThat;
@@ -25,4 +26,15 @@ public void test()
gcMonitor.stop();
}
}

@Test
public void testWarnTrigger()
{
// Don't warn when max heap is not specified
assertThat(JmxGcMonitor.percentOfMaxHeapReclaimed(-1, 100.0, 100.0)).isEqualTo(100.0);
// Don't warn when totalBeforeGcMemory / maxHeapMemoryUsage < FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN
assertThat(JmxGcMonitor.percentOfMaxHeapReclaimed(100, FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN * 100 - 1, 300)).isEqualTo(100.0);
// Return fraction of heap used when totalBeforeGcMemory / maxHeapMemoryUsage >= FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN
assertThat(JmxGcMonitor.percentOfMaxHeapReclaimed(100, FRACTION_OF_MAX_HEAP_TO_TRIGGER_WARN * 100, 300)).isLessThan(100.0);
}
}

0 comments on commit 8eec730

Please sign in to comment.