Skip to content

CPU Usage 100% #2162

Closed
Closed
@leeight

Description

@leeight

It happens occasionally, and the problem disappears after restarting the Node.js process.

  • Version: 1.23.2
  • Platform: Linux n18-035-207 4.4.0-33.bm.1-amd64 #1 SMP Wed, 24 Jan 2018 15:50:58 +0800 x86_64 GNU/Linux
  • Node.js: v8.15.0
$ strace -p PID
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 123, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 123, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
epoll_ctl(4, EPOLL_CTL_MOD, 17, {EPOLLIN, {u32=17, u64=69435872986005521}}) = 0
epoll_pwait(4, {{EPOLLIN|EPOLLHUP, {u32=17, u64=69435872986005521}}}, 1024, 122, NULL, 8) = 1
^CProcess 517 detached

$ lsof -d 17
COMMAND PID USER   FD   TYPE             DEVICE SIZE/OFF      NODE NAME
node    517 root   17u  unix 0xffff881057cb2000      0t0 760434497 socket

Activity

leeight

leeight commented on Jan 25, 2019

@leeight
Author

More info after gcore PID

$ gdb v8.15.0/bin/node coredump
(gdb) bt full
#0  0x00007fbaff2c8877 in __GI_epoll_pwait (epfd=4, events=events@entry=0x7fffffe6dde0, maxevents=maxevents@entry=1024, timeout=-13858697, timeout@entry=547, set=set@entry=0x0)
    at ../sysdeps/unix/sysv/linux/epoll_pwait.c:48
        resultvar = 140737486720528
        oldtype = 0
#1  0x00000000009c8807 in uv__io_poll (loop=loop@entry=0x2184a80 <default_loop_struct>, timeout=547) at ../deps/uv/src/unix/linux-core.c:275
        events = {{events = 17, data = {ptr = 0xf6af8f00000018, fd = 24, u32 = 24, u64 = 69435872986005528}}, {events = 1, data = {ptr = 0xf6af8fffffffff, fd = -1,
              u32 = 4294967295, u64 = 69435877280972799}}, {events = 1, data = {ptr = 0xf6af8fffffffff, fd = -1, u32 = 4294967295, u64 = 69435877280972799}}, {events = 32767,
            data = {ptr = 0x7fffffe70190, fd = -1638000, u32 = 4293329296, u64 = 140737486717328}}, {events = 4293320480, data = {ptr = 0xa9c00007fff, fd = 32767, u32 = 32767,
              u64 = 11665131208703}}, {events = 0, data = {ptr = 0x7fffffe6ff30, fd = -1638608, u32 = 4293328688, u64 = 140737486716720}}, {events = 0, data = {
              ptr = 0xffe6dee000000000, fd = 0, u32 = 0, u64 = 18439670777969115136}}, {events = 32767, data = {
              ptr = 0xf77142 <v8::internal::ParserBase<v8::internal::Parser>::ParseMemberWithNewPrefixesExpression(bool*, bool*)+50>, fd = 16216386, u32 = 16216386,
              u64 = 16216386}}, {events = 56767408, data = {ptr = 0x362340000000000, fd = 0, u32 = 0, u64 = 243814504436072448}}, {events = 0, data = {ptr = 0x7fffffe6df00,
              fd = -1646848, u32 = 4293320448, u64 = 140737486708480}}, {events = 4293328880, data = {ptr = 0xffffffff00007fff, fd = 32767, u32 = 32767,
              u64 = 18446744069414617087}}, {events = 4294967295, data = {ptr = 0x7fff00001168, fd = 4456, u32 = 4456, u64 = 140733193392488}}, {events = 4294967295, data = {
              ptr = 0xffe6ff40ffffffff, fd = -1, u32 = 4294967295, u64 = 18439706378953031679}}, {events = 32767, data = {ptr = 0x7fffffe6ffc8, fd = -1638456, u32 = 4293328840,
              u64 = 140737486716872}}, {events = 4293328704, data = {ptr = 0xffe6ff4000007fff, fd = 32767, u32 = 32767, u64 = 18439706374658097151}}, {events = 32767, data = {
              ptr = 0x3537b08, fd = 55802632, u32 = 55802632, u64 = 55802632}}, {events = 4293320480, data = {ptr = 0xf6f32300007fff, fd = 32767, u32 = 32767,
              u64 = 69510175920259071}}, {events = 0, data = {ptr = 0x7fffffe6df20, fd = -1646816, u32 = 4293320480, u64 = 140737486708512}}, {events = 4293328688, data = {
              ptr = 0xffe7019000007fff, fd = 32767, u32 = 32767, u64 = 18439708917278736383}}, {events = 32767, data = {ptr = 0xa9c, fd = 2716, u32 = 2716, u64 = 2716}}, {
            events = 0, data = {ptr = 0xffe7019000000000, fd = 0, u32 = 0, u64 = 18439708917278703616}}, {events = 32767, data = {ptr = 0x7fffffe6dfb0, fd = -1646672,
              u32 = 4293320624, u64 = 140737486708656}}, {events = 16217920, data = {ptr = 0xffe6dfc000000000, fd = 0, u32 = 0, u64 = 18439671740041789440}}, {events = 32767,
            data = {ptr = 0xf77740 <v8::internal::ParserBase<v8::internal::Parser>::ParseLeftHandSideExpression(bool*)+48>, fd = 16217920, u32 = 16217920, u64 = 16217920}}, {
            events = 4293328704, data = {ptr = 0xffe701c000007fff, fd = 32767, u32 = 32767, u64 = 18439709123437166591}}, {events = 32767, data = {ptr = 0x7fffffe6df90,
              fd = -1646704, u32 = 4293320592, u64 = 140737486708624}}, {events = 57364256, data = {ptr = 0x3625b0000000000, fd = 0, u32 = 0, u64 = 243857385389555712}}, {
            events = 0, data = {ptr = 0xad3a5d <v8::internal::AstRawString::Compare(void*, void*)+189>, fd = 11352669, u32 = 11352669, u64 = 11352669}}, {events = 4293320624,
            data = {ptr = 0xad473400007fff, fd = 32767, u32 = 32767, u64 = 48773459634847743}}, {events = 0, data = {ptr = 0x36524f0, fd = 56960240, u32 = 56960240,
              u64 = 56960240}}, {events = 56777696, data = {ptr = 0xffe6df6000000001, fd = 1, u32 = 1, u64 = 18439671327724929025}}, {events = 32767, data = {ptr = 0x6, fd = 6,
              u32 = 6, u64 = 6}}, {events = 0, data = {ptr = 0x36524f000000000, fd = 0, u32 = 0, u64 = 244642367972311040}}, {events = 0, data = {ptr = 0x6, fd = 6, u32 = 6,
              u64 = 6}}, {events = 159150314, data = {ptr = 0xdc900000001, fd = 1, u32 = 1, u64 = 15156939587585}}, {events = 0, data = {ptr = 0x3661050, fd = 57020496,
              u32 = 57020496, u64 = 57020496}}, {events = 56960240, data = {ptr = 0x36225e800000000, fd = 0, u32 = 0, u64 = 243799008194068480}}, {events = 0, data = {ptr = 0xab0,
              fd = 2736, u32 = 2736, u64 = 2736}}, {events = 4293328688, data = {ptr = 0xffe6e01000007fff, fd = 32767, u32 = 32767, u64 = 18439672083639205887}}, {events = 32767,
            data = {ptr = 0xad4902 <v8::internal::AstValueFactory::GetOneByteStringInternal(v8::internal::Vector<unsigned char const>)+114>, fd = 11356418, u32 = 11356418,
              u64 = 11356418}}, {events = 0, data = {ptr = 0x600000000, fd = 0, u32 = 0, u64 = 25769803776}}, {events = 0, data = {ptr = 0xfeb4f46800000006, fd = 6, u32 = 6,
              u64 = 18353563108955389958}}, {events = 0, data = {ptr = 0xdc900000100, fd = 256, u32 = 256, u64 = 15156939587840}}, {events = 0, data = {ptr = 0x7fffffe70190,
              fd = -1638000, u32 = 4293329296, u64 = 140737486717328}}, {events = 86, data = {ptr = 0x36225e800000000, fd = 0, u32 = 0, u64 = 243799008194068480}}, {events = 0,
            data = {ptr = 0xab0, fd = 2736, u32 = 2736, u64 = 2736}}, {events = 4293328688, data = {ptr = 0xffe6e02000007fff, fd = 32767, u32 = 32767, u64 = 18439672152358682623}},
          {events = 32767, data = {ptr = 0xfa98b8 <v8::internal::Scanner::CurrentSymbol(v8::internal::AstValueFactory*) const+40>, fd = 16423096, u32 = 16423096, u64 = 16423096}}, {
            events = 4293320848, data = {ptr = 0xf60f9100007fff, fd = 32767, u32 = 32767, u64 = 69259959715528703}}, {events = 0, data = {ptr = 0x1, fd = 1, u32 = 1, u64 = 1}}, {
            events = 4293329296, data = {ptr = 0x7fff, fd = 32767, u32 = 32767, u64 = 32767}}, {events = 0, data = {ptr = 0x7fffffe70190, fd = -1638000, u32 = 4293329296,
              u64 = 140737486717328}}, {events = 4293320944, data = {ptr = 0xf7891f00007fff, fd = 32767, u32 = 32767, u64 = 69675085484556287}}, {events = 0, data = {ptr = 0x0,
              fd = 0, u32 = 0, u64 = 0}}, {events = 4293329296, data = {ptr = 0xffe7021800007fff, fd = 32767, u32 = 32767, u64 = 18439709501394288639}}, {events = 32767, data = {
              ptr = 0x7fffffe70190, fd = -1638000, u32 = 4293329296, u64 = 140737486717328}}, {events = 4293329296, data = {ptr = 0x362371000007fff, fd = 32767, u32 = 32767,
              u64 = 243817871690465279}}, {events = 0, data = {ptr = 0x7fffffe6e110, fd = -1646320, u32 = 4293320976, u64 = 140737486709008}}, {events = 16184099, data = {
              ptr = 0xa2900000000, fd = 0, u32 = 0, u64 = 11171209936896}}, {events = 0, data = {ptr = 0x7fffffe6ff30, fd = -1638608, u32 = 4293328688, u64 = 140737486716720}}, {
            events = 4293320976, data = {ptr = 0xad490200007fff, fd = 32767, u32 = 32767, u64 = 48775443909738495}}, {events = 0, data = {ptr = 0x7fffffe6fce0, fd = -1639200,
leeight

leeight commented on Jan 25, 2019

@leeight
Author
$ strace -p PID -c
Process 517 attached
^CProcess 517 detached
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 54.62    0.000739           0     48488           epoll_pwait
 45.38    0.000614           0     48491           epoll_ctl
  0.00    0.000000           0         2           write
------ ----------- ----------- --------- --------- ----------------
100.00    0.001353                 96981           total
saghul

saghul commented on Jan 25, 2019

@saghul
Member

Do you have a (small) test case to reproduce this?

leeight

leeight commented on Jan 25, 2019

@leeight
Author

@saghul Not yet. If I can reproduce it, I'll post it ASAP.

leeight

leeight commented on Jan 30, 2019

@leeight
Author

@saghul The following code can reproduce it.

client.js

// Repro client: connects to the server's UNIX socket but pauses the
// stream BEFORE connecting, so Node never starts reading from the fd.
// Per the discussion below, EOF then cannot be detected after the
// server dies and the event loop spins on EPOLLHUP (see strace above).
const net = require('net');
const sock = '/tmp/example.sock';

const socket = new net.Socket();

// Essential to the repro: pausing before connect() means uv_read_start()
// is never issued for this pipe, so incoming data/EOF is never consumed.
socket.pause();

socket.on('end', () => {
    console.info('end');
});
socket.on('close', () => {
    console.info('close');
});
socket.on('error', (err) => {
    console.error(err);
});
socket.on('connect', () => {
    console.info('connect');
});

socket.connect(sock);

// Keep the process alive so the busy loop can be observed.
setInterval(() => {
    console.info('interval');
}, 10000);

server.js

// Repro server: listens on the UNIX socket. Killing this process while
// the paused client is connected is what triggers the 100% CPU spin.
const net = require('net');
const fs = require('fs');

const sock = '/tmp/example.sock';

// Remove a stale socket file left over from a previous run, otherwise
// listen() would fail with EADDRINUSE.
if (fs.existsSync(sock)) {
    fs.unlinkSync(sock);
}
net.createServer().listen(sock).on('connection', function(c) {
    console.info('client connected');    
});

Reproduce steps:

  1. node server.js
  2. node client.js
  3. kill the server.js

Now the cpu usage is 100%

santigimeno

santigimeno commented on Jan 30, 2019

@santigimeno
Member

It seems the issue is that, when paused, node doesn't read from the socket so the EOF cannot be detected. The reason for this is described in nodejs/node-v0.x-archive#8200, though tbh, I'm not sure about it. I would raise the issue in the nodejs repository and see how it goes.

leeight

leeight commented on Jan 31, 2019

@leeight
Author

@santigimeno Ok, thanks

leeight

leeight commented on Jan 31, 2019

@leeight
Author
oyyd

oyyd commented on Jan 31, 2019

@oyyd
Contributor

It seems that if we don't call uv_read_start() on a pipe socket, the epoll_pwait inside uv__io_poll doesn't block the thread, so the event loop spins far too frequently. For example, the client below will show high CPU usage if we don't call uv_read_start():

server:

#include <stdio.h>
#include <stdlib.h>
#include <uv.h>
#include <time.h>

/* Fill s with len random alphanumeric characters and NUL-terminate it.
 * The caller must supply a buffer of at least len + 1 bytes; rand()
 * should be seeded beforehand. */
void gen_random(char *s, const int len) {
    static const char alphanum[] =
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz";
    int i;

    for (i = 0; i < len; i++)
        s[i] = alphanum[rand() % (int)(sizeof(alphanum) - 1)];

    s[len] = '\0';
}

static uv_loop_t loop;
static uv_pipe_t conn;

/* uv_write completion callback: release the heap uv_buf_t stashed in
 * req->data by pipe_server_connection_cb, then the request itself. */
static void write_cb(uv_write_t* req, int status) {
  uv_buf_t* buf = req->data;
  free(buf);
  free(req);
}

/* Accept the incoming pipe connection and write "hello" to it.
 * NOTE(review): a single static `conn` is reused, so a second
 * connection would clobber the first — fine for a repro only.
 * The heap uv_buf_t is stashed in write_req->data and freed in
 * write_cb; buf->base points at a string literal and is never freed.
 * malloc results are unchecked (repro code). */
static void pipe_server_connection_cb(uv_stream_t* handle, int status) {
  uv_pipe_init(&loop, &conn, 0);
  uv_accept(handle, (uv_stream_t*)&conn);

  uv_write_t* write_req = malloc(sizeof(uv_write_t));
  uv_buf_t* buf = malloc(sizeof(uv_buf_t));
  buf->base = "hello";
  buf->len = 5;
  write_req->data = buf;
  uv_write(write_req, (uv_stream_t*)&conn, buf, 1, write_cb);
}

/* Pipe server entry point: bind a uv pipe to a random path under
 * tmpdir, print the path to stderr, and serve connections.
 * Fixes vs. original: strcpy was called without <string.h> (implicit
 * declaration), and the suffix offset was hard-coded as 49 instead of
 * being derived from the prefix length. snprintf (from the already
 * included <stdio.h>) provides both the copy and the length. */
int main() {
  srand(time(NULL));

  char* tmpdir = "/var/folders/8m/k27w6hhd0058d53njmb5vz8c0000gp/T/";

  char random_name[60];
  /* Copy the prefix and learn its length in one call. */
  int prefix_len = snprintf(random_name, sizeof(random_name), "%s", tmpdir);
  if (prefix_len < 0 || (size_t)prefix_len + 6 >= sizeof(random_name)) {
    fprintf(stderr, "socket path too long\n");
    return 1;
  }
  /* Append a 6-character random suffix right after the prefix. */
  gen_random(random_name + prefix_len, 6);

  uv_pipe_t handle;

  uv_loop_init(&loop);
  uv_pipe_init(&loop, &handle, 0);

  uv_pipe_bind(&handle, random_name);

  uv_listen((uv_stream_t*) &handle, 0, pipe_server_connection_cb);

  fprintf(stderr, "%s\n", random_name);

  uv_run(&loop, UV_RUN_DEFAULT);

  return 0;
}

client:

#include <stdio.h>
#include <uv.h>
#include <stdlib.h>

static uv_loop_t loop;
static uv_pipe_t handle;
static uv_connect_t conn_req;
static uv_timer_t timer;

/* Allocation callback: hand libuv the same one-byte static buffer for
 * every read request. */
static void alloc_cb(uv_handle_t* handle, size_t size, uv_buf_t* buf) {
  static char storage[1];

  buf->base = storage;
  buf->len = sizeof(storage);
}

/* Read callback: print received bytes; EOF and errors (nread <= 0)
 * are deliberately ignored in this repro. */
static void read_cb(uv_stream_t* handle,
                    ssize_t nread,
                    const uv_buf_t* buf) {
  if (nread <= 0)
    return;
  fprintf(stderr, "nread %ld %s\n", nread, buf->base);
}

/* Connect callback: report the status. uv_read_start() is deliberately
 * NOT called — that omission is what this repro demonstrates. */
static void connect_cb(uv_connect_t* connect_req, int status) {
  fprintf(stderr, "connected: %d", status);
  // NOTE Don't call uv_read_start.
  // uv_read_start((uv_stream_t*)&handle, alloc_cb, read_cb);
}

/* uv_walk callback: print the type name of each live handle. */
static void walk_cb(uv_handle_t* handle, void* arg) {
  const char* type_name = uv_handle_type_name(handle->type);
  fprintf(stderr, "handle_type: %s\n", type_name);
}

/* Periodic timer callback: enumerate the loop's live handles. */
static void timer_cb(uv_timer_t* timer) {
  uv_walk(&loop, walk_cb, NULL);
}

/* Pipe client entry point: connect to the socket path given as the
 * first argument (the path printed by the server).
 * Fix vs. original: argv[1] was dereferenced without checking argc,
 * a NULL-pointer dereference when run without arguments. */
int main(int argc, char* argv[]) {
  if (argc < 2) {
    fprintf(stderr, "usage: %s <socket-path>\n", argv[0]);
    return 1;
  }
  char* socket_name = argv[1];

  uv_loop_init(&loop);
  uv_pipe_init(&loop, &handle, 0);
  uv_pipe_connect(&conn_req, &handle, socket_name, connect_cb);

  /* A repeating timer keeps the loop referenced so uv_run() does not
   * return immediately; its callback just lists live handles. */
  uv_timer_init(&loop, &timer);
  uv_timer_start(&timer, timer_cb, 2000, 2000);

  uv_run(&loop, UV_RUN_DEFAULT);
  return 0;
}

Since it can be reproduced with libuv directly, I guess it's not simply a Node.js issue.

tsyeyuanfeng

tsyeyuanfeng commented on Jan 31, 2019

@tsyeyuanfeng

uv_pipe_connect registers the POLLIN event on the io watcher right after the pipe is connected, but at that moment stream->read_cb and stream->flags are not yet set.

// src/unix/pipe.c

/* Abridged excerpt (the real function computes `err` in the elided
 * portion). Key point for this issue: on success the watcher is armed
 * for POLLIN | POLLOUT even though no read_cb is installed yet. */
void uv_pipe_connect(uv_connect_t* req,
                    uv_pipe_t* handle,
                    const char* name,
                    uv_connect_cb cb) {
  if (err == 0)
    uv__io_start(handle->loop, &handle->io_watcher, POLLIN | POLLOUT);
}

If the socket receives data from its peer, epoll will trigger the POLLIN event. However, the associated io watcher will fail to read from the socket, because stream->read_cb is NULL and stream->flags is zero.

// src/unix/stream.c

/* Abridged excerpt from libuv's stream read path. Relevant to this
 * issue: if read_cb is NULL or UV_HANDLE_READING is unset, the while
 * loop below never executes and no bytes are drained from the fd. */
static void uv__read(uv_stream_t* stream) {
  uv_buf_t buf;
  ssize_t nread;
  struct msghdr msg;
  char cmsg_space[CMSG_SPACE(UV__CMSG_FD_SIZE)];
  int count;
  int err;
  int is_ipc;

  stream->flags &= ~UV_HANDLE_READ_PARTIAL;

  /* Prevent loop starvation when the data comes in as fast as (or faster than)
   * we can read it. XXX Need to rearm fd if we switch to edge-triggered I/O.
   */
  count = 32;

  is_ipc = stream->type == UV_NAMED_PIPE && ((uv_pipe_t*) stream)->ipc;

  /* XXX: Maybe instead of having UV_HANDLE_READING we just test if
   * tcp->read_cb is NULL or not?
   */
  /* KEY to issue #2162: with no read_cb and no UV_HANDLE_READING flag,
   * this condition is immediately false and the data stays queued. */
  while (stream->read_cb
      && (stream->flags & UV_HANDLE_READING)
      && (count-- > 0)) {
    assert(stream->alloc_cb != NULL);

    buf = uv_buf_init(NULL, 0);
    stream->alloc_cb((uv_handle_t*)stream, 64 * 1024, &buf);
    if (buf.base == NULL || buf.len == 0) {
      /* User indicates it can't or won't handle the read. */
      stream->read_cb(stream, UV_ENOBUFS, &buf);
      return;
    }

    assert(buf.base != NULL);
    assert(uv__stream_fd(stream) >= 0);

    if (!is_ipc) {
      do {
        nread = read(uv__stream_fd(stream), buf.base, buf.len);
      }
      while (nread < 0 && errno == EINTR);
    } else {
      /* ipc uses recvmsg */
      msg.msg_flags = 0;
      msg.msg_iov = (struct iovec*) &buf;
      msg.msg_iovlen = 1;
      msg.msg_name = NULL;
      msg.msg_namelen = 0;
      /* Set up to receive a descriptor even if one isn't in the message */
      msg.msg_controllen = sizeof(cmsg_space);
      msg.msg_control = cmsg_space;

      do {
        nread = uv__recvmsg(uv__stream_fd(stream), &msg, 0);
      }
      while (nread < 0 && errno == EINTR);
    }
    // ...omit something
  }
}

Since the data is never read, epoll keeps triggering the POLLIN event, which causes the event loop to spin continuously.

In contrast, uv__tcp_connect doesn't register POLLIN event at the very start, so tcp socket won't trigger this issue.

/* Abridged excerpt from libuv's TCP connect path. Note that only
 * POLLOUT is registered at connect time — which is why TCP sockets do
 * not exhibit the busy-loop described above. */
int uv__tcp_connect(uv_connect_t* req,
                    uv_tcp_t* handle,
                    const struct sockaddr* addr,
                    unsigned int addrlen,
                    uv_connect_cb cb) {
  uv__io_start(handle->loop, &handle->io_watcher, POLLOUT);
}

So I think it is reasonable to remove the POLLIN flag at uv_pipe_connect to resolve this issue.

santigimeno

santigimeno commented on Jan 31, 2019

@santigimeno
Member

So I think it is reasonable to remove the POLLIN flag at uv_pipe_connect to resolve this issue.

Yes, it seems reasonable. Good find. Could you open a PR? Thanks

tsyeyuanfeng

tsyeyuanfeng commented on Jan 31, 2019

@tsyeyuanfeng

So I think it is reasonable to remove the POLLIN flag at uv_pipe_connect to resolve this issue.

Yes, it seems reasonable. Good find. Could you open a PR? Thanks

Ok

17 remaining items

Loading
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions

      CPU Usage 100% · Issue #2162 · libuv/libuv