Skip to content

Commit

Permalink
Merge pull request kubernetes#22 from Random-Liu/add-look-back
Browse files Browse the repository at this point in the history
Kernel Monitor: Add look back support and kernel panic handling
  • Loading branch information
dchen1107 authored Aug 24, 2016
2 parents 92a3c35 + 532f933 commit ea83111
Show file tree
Hide file tree
Showing 12 changed files with 249 additions and 114 deletions.
12 changes: 12 additions & 0 deletions config/kernel-monitor.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"logPath": "/log/kern.log",
"lookback": "10m",
"startPattern": "Initializing cgroup subsys cpuset",
"bufferSize": 10,
"source": "kernel-monitor",
"conditions": [
Expand All @@ -20,6 +22,16 @@
"reason": "TaskHung",
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "temporary",
"reason": "KernelPanic",
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
},
{
"type": "temporary",
"reason": "KernelPanic",
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
},
{
"type": "permanent",
"condition": "KernelDeadlock",
Expand Down
4 changes: 2 additions & 2 deletions demo/demo
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ rebootAll() {

reboot() {
LATEST=`gcloud compute ssh -n root@$1 --zone=$ZONE "tail -1 $KERNLOG"`
PREFIX=`echo $LATEST | cut -d "[" -f 1 -`"[0.000000]"
runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$PREFIX reboot' >> $KERNLOG\""
PREFIX=`echo $LATEST | cut -d "]" -f 1 -`"]"
runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$PREFIX Initializing cgroup subsys cpuset' >> $KERNLOG\""
}

case $1 in
Expand Down
36 changes: 36 additions & 0 deletions demo/divide_zero
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
divide error: 0000 [#1] SMP
Modules linked in: dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio libcrc32c xt_statistic xt_nat xt_mark ipt_REJECT xt_tcpudp xt_comment loop veth binfmt_misc sch_htb ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge stp llc aufs(C) nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc crc32_pclmul ppdev aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd evdev psmouse serio_raw parport_pc ttm parport drm_kms_helper drm i2c_piix4 i2c_core processor button thermal_sys autofs4 ext4 crc16 mbcache jbd2 btrfs xor raid6_pq dm_mod ata_generic crct10dif_pclmul crct10dif_common xen_netfront xen_blkfront crc32c_intel ata_piix libata scsi_mod floppy
CPU: 1 PID: 1853 Comm: docker Tainted: G C 3.16.0-4-amd64 #1 Debian 3.16.7-ckt20-1+deb8u4
Hardware name: Xen HVM domU, BIOS 4.2.amazon 05/12/2016
task: ffff8801e3657470 ti: ffff8801e47a8000 task.ti: ffff8801e47a8000
RIP: 0010:[<ffffffffa0577200>] [<ffffffffa0577200>] pool_io_hints+0xf0/0x1a0 [dm_thin_pool]
RSP: 0018:ffff8801e47abbc8 EFLAGS: 00010246
RAX: 0000000000010000 RBX: ffff8801e4736840 RCX: ffff8801c2662000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8801e48c4080
RBP: ffff8801e47abc10 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffa057f5c8
R13: 0000000000000001 R14: ffff8801e47abc90 R15: 0000000000000131
FS: 00007ff465daf700(0000) GS:ffff8801efc20000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000c207f1c3fb CR3: 00000001e2a5a000 CR4: 00000000001406e0
Stack:
ffffffff810a7c71 0000000043e06d70 ffffc9000115f040 0000000000000000
0000000043e06d70 ffffc9000115f040 0000000000000000 ffff8800e9da3800
ffffffffa00ba615 000fffffffffffff 00000000ffffffff 00000000000000ff
Call Trace:
[<ffffffff810a7c71>] ? complete+0x31/0x40
[<ffffffffa00ba615>] ? dm_calculate_queue_limits+0x95/0x130 [dm_mod]
[<ffffffffa00b7ec3>] ? dm_swap_table+0x73/0x320 [dm_mod]
[<ffffffffa00b0101>] ? crc_t10dif_generic+0x101/0x1000 [crct10dif_common]
[<ffffffffa00bd0d0>] ? table_load+0x330/0x330 [dm_mod]
[<ffffffffa00bd165>] ? dev_suspend+0x95/0x220 [dm_mod]
[<ffffffffa00bda55>] ? ctl_ioctl+0x205/0x430 [dm_mod]
[<ffffffffa00bdc8f>] ? dm_ctl_ioctl+0xf/0x20 [dm_mod]
[<ffffffff811ba99f>] ? do_vfs_ioctl+0x2cf/0x4b0
[<ffffffff810d485e>] ? SyS_futex+0x6e/0x150
[<ffffffff811bac01>] ? SyS_ioctl+0x81/0xa0
[<ffffffff81513ecd>] ? system_call_fast_compare_end+0x10/0x15
Code: 0f 84 a5 00 00 00 3b 96 10 06 00 00 49 c7 c4 c8 f5 57 a0 77 26 8b b6 18 06 00 00 89 d0 c1 e0 09 48 39 f0 0f 82 92 00 00 00 31 d2 <48> f7 f6 85 d2 74 2d 49 c7 c4 70 f5 57 a0 66 90 48 89 e6 e8 28
RIP [<ffffffffa0577200>] pool_io_hints+0xf0/0x1a0 [dm_thin_pool]
RSP <ffff8801e47abbc8>
---[ end trace fcce781faebae9ce ]---
36 changes: 36 additions & 0 deletions demo/null_pointer
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
BUG: unable to handle kernel NULL pointer dereference at 0000000000000078
IP: [<ffffffff810a2c38>] pick_next_task_fair+0x6b8/0x820
PGD 0
Oops: 0000 [#1] SMP
Modules linked in: binfmt_misc ipt_REJECT xt_nat xt_tcpudp xt_multiport veth xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge stp llc aufs(C) cpufreq_conservative cpufreq_powersave cpufreq_stats cpufreq_userspace xfs libcrc32c x86_pkg_temp_thermal intel_powerclamp intel_rapl coretemp kvm_intel kvm crc32_pclmul iTCO_wdt iTCO_vendor_support aesni_intel aes_x86_64 lrw gf128mul ttm glue_helper evdev ablk_helper cryptd drm_kms_helper drm serio_raw shpchp i2c_i801 lpc_ich mfd_core wmi ipmi_msghandler processor tpm_infineon thermal_sys tpm_tis tpm acpi_pad button autofs4 ext4 crc16 mbcache jbd2 btrfs xor raid6_pq dm_mod raid1 md_mod sg sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul crct10dif_common crc32c_intel ehci_pci ahci igb libahci xhci_hcd ehci_hcd i2c_algo_bit i2c_core libata dca ptp usbcore scsi_mod pps_core usb_common
CPU: 11 PID: 63 Comm: ksoftirqd/11 Tainted: G C 3.16.0-4-amd64 #1 Debian 3.16.7-ckt20-1+deb8u3
Hardware name: ASUSTeK COMPUTER INC. Z10PA-U8 Series/Z10PA-U8 Series, BIOS 0601 06/26/2015
task: ffff881fe1d0cb60 ti: ffff881fe1d18000 task.ti: ffff881fe1d18000
RIP: 0010:[<ffffffff810a2c38>] [<ffffffff810a2c38>] pick_next_task_fair+0x6b8/0x820
RSP: 0018:ffff881fe1d1bde0 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff881fa84a3580 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff881fb5837628 RDI: ffff881d1fab2308
RBP: ffff881fb5837600 R08: 0000000000000000 R09: 000000000000b9dc
R10: 0000000000000004 R11: 0000000000000005 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: ffff88207fd72f00
FS: 0000000000000000(0000) GS:ffff88207fd60000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000078 CR3: 0000000001813000 CR4: 00000000001407e0
Stack:
ffff881d1fab2290 000000018109ef94 ffff881fe1d0cb60 0000000000012f00
ffff88207fd72f78 ffffffff8101b975 ffff881fe1d0cfb8 ffff881fe1d0cb60
ffff88207fd72f00 000000000000000b 0000000000000000 0000000000000000
Call Trace:
[<ffffffff8101b975>] ? sched_clock+0x5/0x10
[<ffffffff8150fed6>] ? __schedule+0x106/0x700
[<ffffffff8108ea86>] ? smpboot_thread_fn+0xc6/0x190
[<ffffffff8108e9c0>] ? SyS_setgroups+0x170/0x170
[<ffffffff8108805d>] ? kthread+0xbd/0xe0
[<ffffffff81087fa0>] ? kthread_create_on_node+0x180/0x180
[<ffffffff81513c58>] ? ret_from_fork+0x58/0x90
[<ffffffff81087fa0>] ? kthread_create_on_node+0x180/0x180
Code: 49 8b 7c 24 78 48 39 fd 74 2f 44 8b 73 68 45 8b 6c 24 68 45 39 ee 0f 8e c7 00 00 00 48 89 ef 48 89 de e8 fc 91 ff ff 48 8b 5b 70 <49> 8b 7c 24 78 48 8b 6b 78 48 39 fd 75 d1 48 85 ed 74 cc 4c 89
RIP [<ffffffff810a2c38>] pick_next_task_fair+0x6b8/0x820
RSP <ffff881fe1d1bde0>
CR2: 0000000000000078
---[ end trace 61f6991dc9ee5be0 ]---
2 changes: 1 addition & 1 deletion node_problem_detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
)

var (
kernelMonitorConfigPath = flag.String("kernel-monitor", "/config/kernel_monitor", "The path to the kernel monitor config file")
kernelMonitorConfigPath = flag.String("kernel-monitor", "/config/kernel_monitor.json", "The path to the kernel monitor config file")
)

func main() {
Expand Down
49 changes: 33 additions & 16 deletions pkg/kernelmonitor/kernel_log_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ package kernelmonitor

import (
"bufio"
"fmt"
"os"
"time"

"k8s.io/node-problem-detector/pkg/kernelmonitor/translator"
"k8s.io/node-problem-detector/pkg/kernelmonitor/types"
"k8s.io/node-problem-detector/pkg/kernelmonitor/util"

"github.com/golang/glog"
"github.com/hpcloud/tail"
utilclock "github.com/pivotal-golang/clock"
)

const (
Expand All @@ -34,7 +37,12 @@ const (

// WatcherConfig is the configuration of kernel log watcher.
type WatcherConfig struct {
// KernelLogPath is the path to the kernel log
KernelLogPath string `json:"logPath, omitempty"`
// StartPattern is the pattern of the start line
StartPattern string `json:"startPattern, omitempty"`
// Lookback is the time kernel watcher looks up
Lookback string `json:"lookback, omitempty"`
}

// KernelLogWatcher watches and translates the kernel log. Once there is new log line,
Expand All @@ -53,6 +61,7 @@ type kernelLogWatcher struct {
tl *tail.Tail
logCh chan *types.KernelLog
tomb *util.Tomb
clock utilclock.Clock
}

// NewKernelLogWatcher creates a new kernel log watcher.
Expand All @@ -63,6 +72,7 @@ func NewKernelLogWatcher(cfg WatcherConfig) KernelLogWatcher {
tomb: util.NewTomb(),
// A capacity 1000 buffer should be enough
logCh: make(chan *types.KernelLog, 1000),
clock: utilclock.NewClock(),
}
}

Expand Down Expand Up @@ -144,39 +154,46 @@ func (k *kernelLogWatcher) watchLoop() {
}
}

// getStartPoint parses the newest kernel log file and try to find the latest reboot point.
// Currently we rely on the kernel log timestamp to find the reboot point. The basic idea
// is straight forward: In the whole lifecycle of a node, the kernel log timestamp should
// always increase, only when it is reboot, the timestamp will decrease. We just parse the
// log and find the latest timestamp decreasing, then it should be the latest reboot point.
// TODO(random-liu): A drawback is that if the node is started long time ago, we'll only get
// logs in the newest kernel log file. We may want to improve this in the future.
// getStartPoint finds the start point to parse the log. The start point is either
// the line at (now - lookback) or the first line of kernel log.
// Notice that, kernel log watcher doesn't look back to the rolled out logs.
func (k *kernelLogWatcher) getStartPoint(path string) (int64, error) {
f, err := os.Open(path)
if err != nil {
return -1, err
return 0, fmt.Errorf("failed to open file %q: %v", path, err)
}
defer f.Close()
lookback, err := parseDuration(k.cfg.Lookback)
if err != nil {
return 0, fmt.Errorf("failed to parse duration %q: %v", k.cfg.Lookback, err)
}
start := int64(0)
total := 0
lastTimestamp := int64(0)
reader := bufio.NewReader(f)
done := false
for !done {
line, err := reader.ReadString('\n')
if err != nil {
if len(line) == 0 {
// No need to continue parsing if nothing is read
break
}
done = true
}
total += len(line)
log, err := k.trans.Translate(line)
if err != nil {
glog.Infof("unable to parse line: %q, %v", line, err)
continue
}
if log.Timestamp < lastTimestamp {
start = int64(total - len(line))
} else if k.clock.Since(log.Timestamp) <= lookback {
break
}
lastTimestamp = log.Timestamp
start += int64(len(line))
}
return start, nil
}

func parseDuration(s string) (time.Duration, error) {
// If the duration is not configured, just return 0 by default
if s == "" {
return 0, nil
}
return time.ParseDuration(s)
}
57 changes: 41 additions & 16 deletions pkg/kernelmonitor/kernel_log_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,50 +24,73 @@ import (
"time"

"k8s.io/node-problem-detector/pkg/kernelmonitor/types"

"github.com/pivotal-golang/clock/fakeclock"
)

func TestGetStartPoint(t *testing.T) {
// now is a fake time
now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local)
fakeClock := fakeclock.NewFakeClock(now)
testCases := []struct {
log string
logs []types.KernelLog
err bool
log string
logs []types.KernelLog
lookback string
}{
{
// The start point is at the head of the log file.
log: `kernel: [1.000000] 1
kernel: [2.000000] 2
kernel: [3.000000] 3
log: `Jan 2 03:04:05 kernel: [0.000000] 1
Jan 2 03:04:06 kernel: [1.000000] 2
Jan 2 03:04:07 kernel: [2.000000] 3
`,
logs: []types.KernelLog{
{
Timestamp: 1000000,
Timestamp: now,
Message: "1",
},
{
Timestamp: 2000000,
Timestamp: now.Add(time.Second),
Message: "2",
},
{
Timestamp: 3000000,
Timestamp: now.Add(2 * time.Second),
Message: "3",
},
},
},
{
// The start point is in the middle of the log file.
log: `kernel: [3.000000] 3
kernel: [1.000000] 1
kernel: [2.000000] 2
log: `Jan 2 03:04:04 kernel: [0.000000] 1
Jan 2 03:04:05 kernel: [1.000000] 2
Jan 2 03:04:06 kernel: [2.000000] 3
`,
logs: []types.KernelLog{
{
Timestamp: 1000000,
Message: "1",
Timestamp: now,
Message: "2",
},
{
Timestamp: 2000000,
Timestamp: now.Add(time.Second),
Message: "3",
},
},
},
{
// The start point is at the end of the log file, but we look back.
log: `Jan 2 03:04:03 kernel: [0.000000] 1
Jan 2 03:04:04 kernel: [1.000000] 2
Jan 2 03:04:05 kernel: [2.000000] 3
`,
lookback: "1s",
logs: []types.KernelLog{
{
Timestamp: now.Add(-time.Second),
Message: "2",
},
{
Timestamp: now,
Message: "3",
},
},
},
}
Expand All @@ -84,7 +107,9 @@ func TestGetStartPoint(t *testing.T) {
if err != nil {
t.Fatal(err)
}
w := NewKernelLogWatcher(WatcherConfig{KernelLogPath: f.Name()})
w := NewKernelLogWatcher(WatcherConfig{KernelLogPath: f.Name(), Lookback: test.lookback})
// Set the fake clock.
w.(*kernelLogWatcher).clock = fakeClock
logCh, err := w.Watch()
if err != nil {
t.Fatal(err)
Expand Down
Loading

0 comments on commit ea83111

Please sign in to comment.