feat: add nvme plugin + refactor physical server
This commit is contained in:
340
roles/munin_client/files/nvme
Normal file
340
roles/munin_client/files/nvme
Normal file
@@ -0,0 +1,340 @@
|
|||||||
|
#! /usr/bin/perl
|
||||||
|
# -*- mode: perl; perl-indent-level: 4 -*-
|
||||||
|
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
nvme - Munin plugin to monitor the use of NVMe devices
|
||||||
|
|
||||||
|
=head1 APPLICABLE SYSTEMS
|
||||||
|
|
||||||
|
Linux systems with NVMe (Non-Volatile Memory storage attached via PCIe
|
||||||
|
bus).
|
||||||
|
|
||||||
|
=head1 CONFIGURATION
|
||||||
|
|
||||||
|
The plugin uses nvme(1) from the nvme-cli project to read status from
|
||||||
|
the NVMe devices. This requires root access.
|
||||||
|
|
||||||
|
[nvme]
|
||||||
|
user root
|
||||||
|
|
||||||
|
When setting alert levels per device, use graph and basename of device
|
||||||
|
name, e.g., 'nvme0n1', to make environment variable:
|
||||||
|
|
||||||
|
env.nvme_usage_nvme0n1_warning 5:
|
||||||
|
env.nvme_usage_warning 8:
|
||||||
|
|
||||||
|
If your device names change on reboot you can also use the labels
|
||||||
|
(based on serial numbers) to set the warning and critical labels
|
||||||
|
|
||||||
|
env.nvme_usage_SN_1234567_warning 8:101
|
||||||
|
env.nvme_usage_SN_1234567_critical 5:101
|
||||||
|
|
||||||
|
=head1 INTERPRETATION
|
||||||
|
|
||||||
|
This is a multigraph plugin which makes four graphs.
|
||||||
|
|
||||||
|
=head2 nvme_usage
|
||||||
|
|
||||||
|
This reports how much of capacity is allocated in each NVMe
|
||||||
|
"namespace". The report is in percent. This number may not have much
|
||||||
|
relation to actual use, e.g., if deleted data areas have not been
|
||||||
|
trimmed/discarded.
|
||||||
|
|
||||||
|
Default warning and critical: '95', '98'
|
||||||
|
|
||||||
|
=head2 nvme_bytes
|
||||||
|
|
||||||
|
This reports read and write activity on each NVMe device, in bytes per
|
||||||
|
second. Ideally there should be much more read than write. If they
|
||||||
|
are symmetrical, you are using your NVMe as a very expensive FIFO, and
|
||||||
|
if you write more than you read, you should probably look for archival
|
||||||
|
storage instead.
|
||||||
|
|
||||||
|
It is a good idea to compare these numbers to I/O counters from
|
||||||
|
diskstats. If they are much higher, look into whether the write
|
||||||
|
amplification can be due to suboptimal I/O request sizes.
|
||||||
|
|
||||||
|
This graph does not support alerting.
|
||||||
|
|
||||||
|
=head2 nvme_writecycles
|
||||||
|
|
||||||
|
This graph is intended to give an indication of how much life there
|
||||||
|
is left in your NVMe. It calculates the number of bytes written
|
||||||
|
during each device's lifetime against the capacity of the device,
|
||||||
|
thereby getting the average number of write cycles each cell has
|
||||||
|
experienced.
|
||||||
|
|
||||||
|
A prosumer NVMe will handle a few thousand writes to each cell before
|
||||||
|
the error rate gets out of hand.
|
||||||
|
|
||||||
|
No default values for warning and critical.
|
||||||
|
|
||||||
|
=head2 nvme_spare
|
||||||
|
|
||||||
|
Every NVMe device sets aside reserve space to remap media errors. This
|
||||||
|
graphs how much is left in percent, taken directly from smart-log
|
||||||
|
output.
|
||||||
|
|
||||||
|
Default warning and critical: '10:', '3:'
|
||||||
|
|
||||||
|
=head1 MAGIC MARKERS
|
||||||
|
|
||||||
|
#%# family=auto
|
||||||
|
#%# capabilities=autoconf
|
||||||
|
|
||||||
|
=head1 BUGS
|
||||||
|
|
||||||
|
None known.
|
||||||
|
|
||||||
|
=head1 VERSION
|
||||||
|
|
||||||
|
1.1
|
||||||
|
|
||||||
|
=head1 AUTHOR
|
||||||
|
|
||||||
|
Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
|
||||||
|
|
||||||
|
=head1 LICENSE
|
||||||
|
|
||||||
|
GPLv2
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use Munin::Plugin;
|
||||||
|
use IPC::Cmd qw(can_run);
|
||||||
|
use File::Basename;
|
||||||
|
|
||||||
|
# Check that multigraph is supported
|
||||||
|
need_multigraph();
|
||||||
|
|
||||||
|
# Return undef if no problem, otherwise explanation
|
||||||
|
# Explain why the plugin cannot work on this host.
#
# Returns nothing (undef in scalar context) when an "nvme" executable is
# found by can_run().  Otherwise returns a short explanation string:
#   "missing nvme(1)" - /proc/modules lists a module whose name starts
#                       with "nvme", so only the CLI tool is missing;
#   "missing nvme"    - no such module line was seen, or /proc/modules
#                       could not be opened.
sub autoconf_problem {
    return if can_run('nvme');

    # Three-argument open keeps the mode explicit and separate from the
    # path.
    if (open(my $mods, '<', '/proc/modules')) {
        while (my $line = <$mods>) {
            # On early return the lexical handle is closed when it goes
            # out of scope.
            return "missing nvme(1)" if $line =~ /^nvme[^a-z]/;
        }
        close($mods);
    }
    return "missing nvme";    # vague message for non-Linux
}
|
||||||
|
|
||||||
|
# Run "nvme" with the given arguments and return its output lines.
#
# Arguments: the sub-command and its parameters, passed to nvme as a
# list.  Returns the lines read from the command's standard output
# (newlines included); the list is empty when no nvme executable is
# found or the pipe cannot be opened.  If the command exits non-zero
# while the effective uid is not 0, a hint about needing root is
# written with warn.
sub run_nvme {
    my (@cmd) = @_;
    my @output;

    return @output unless can_run('nvme');

    open(my $pipe, '-|', 'nvme', @cmd) or return @output;
    @output = <$pipe>;
    close($pipe);

    # After closing the pipe, $? holds the status of the child process.
    warn "nvme: probably needs to run as user root\n" if $? && $> != 0;

    return @output;
}
|
||||||
|
|
||||||
|
# Convert a size such as "695.50 GB" into a whole number of bytes.
#
# Argument: a string containing a decimal number, whitespace and a unit
# (B, kB, MB, GB, TB or PB; decimal SI multipliers).
# Returns the size truncated to an integer, or 0 when the string is
# undefined, does not contain a "<number> <unit>" pair, or uses a unit
# that is not in the table.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        B  => 1,
        kB => 1000,
        MB => 1000_000,
        GB => 1000_000_000,
        TB => 1000_000_000_000,
        PB => 1000_000_000_000_000,    # I wish I had need for this
    );

    # The match result must be checked: after a failed match the capture
    # variables still hold whatever the caller's last successful match
    # left in them.
    return 0 unless defined $str;
    my ($number, $unit) = $str =~ /(\d+(?:\.\d+)?)\s+([A-Za-z]?B)/
        or return 0;

    my $factor = $units{$unit};
    return 0 unless defined $factor;

    return int($number * $factor);
}
|
||||||
|
|
||||||
|
# Parse the table printed by "nvme list".
#
# Returns a reference to a hash keyed by "SN_<serial>".  Each value is a
# hash with the keys device, sn, model, namespace, usage and capacity;
# usage and capacity are converted to bytes with human_to_bytes().
# When the command produced output that was not recognised, a message
# asking for a report is written with warn.
#
# Sample of the expected output:
# Node             SN                   Model                                    Namespace Usage                      Format           FW Rev
# ---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
# /dev/nvme1n1     S464NB0K601188N      Samsung SSD 970 EVO 2TB                  1         695.50  GB /   2.00  TB    512   B +  0 B   1B2QEXE7
sub nvme_list {
    my %found;

    my $format_ok;     # set by the header line, cleared by a bad line
    my $seen = 0;      # number of output lines read so far
    for my $line (run_nvme('list')) {
        $seen++;

        if ($line =~ m:^Node\s+SN\s+Model\s+Namespace Usage:) {
            $format_ok++;
            next;
        }

        if ($line =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):) {
            # Copy the captures before calling anything else.
            my ($node, $serial, $model, $ns, $used, $total)
                = ($1, $2, $3, $4, $5, $6);
            $found{'SN_'.$serial} = {
                device    => $node,
                sn        => $serial,
                model     => $model,
                namespace => $ns,
                usage     => human_to_bytes($used),
                capacity  => human_to_bytes($total),
            };
            next;
        }

        # The first two lines may be anything (presumably the header
        # and the dashed ruler); an unparsable line after them means
        # the device information could not be read.
        $format_ok = 0 if $seen > 2;
    }

    warn "Could not recognise output from 'nvme list', please report\n"
        if $seen && !$format_ok;

    return \%found;
}
|
||||||
|
|
||||||
|
# Collect the "name : value" pairs printed by "nvme smart-log <device>".
#
# Argument: the device path handed to nvme.
# Returns a reference to a hash.  Keys are the field names in lower
# case with every whitespace character replaced by "_"; values are the
# text after the colon.  A value made up solely of comma-grouped digits
# has its commas removed; any other value is stored unchanged.
sub smart_log {
    my ($dev) = @_;

    my %fields;
    LINE: for my $line (run_nvme('smart-log', $dev)) {
        next LINE if $line =~ /^Smart Log/;    # heading, not a field
        next LINE unless $line =~ /(.*?)\s+:\s+(.*)/;

        my $name = $1;
        my $text = $2;
        $name =~ s/\s/_/g;

        # "1,234,567" becomes "1234567".
        $text =~ s/,//g if $text =~ /^\d+(,\d\d\d)+$/;

        $fields{lc $name} = $text;
    }
    return \%fields;
}
|
||||||
|
|
||||||
|
# Print the ".warning" and ".critical" config lines for one field.
#
# Arguments:
#   $label        - field name, also used in the label-based setting
#                   names "<graph>_<label>_warning" / "..._critical"
#   $graph        - graph name, used as prefix of the setting names
#   $device       - device path; its basename forms the device-based
#                   setting names "<graph>_<basename>_warning" etc.
#   $warn_default - warning level handed to the first lookup (optional)
#   $crit_default - critical level handed to the first lookup (optional)
#
# get_thresholds() is called twice: the result of the label-based
# lookup is passed as the default of the device-based lookup.
# Presumably get_thresholds() falls back to the defaults it is given
# when a setting is absent - see Munin::Plugin to confirm.
# A line is printed only for a level that ends up defined.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    my $short = basename($device);

    my @by_label = get_thresholds(
        $graph,
        "${graph}_${label}_warning",
        "${graph}_${label}_critical",
        $warn_default,
        $crit_default,
    );

    # Exactly the first two results are handed on as defaults.
    my ($warn, $crit) = get_thresholds(
        $graph,
        "${graph}_${short}_warning",
        "${graph}_${short}_critical",
        @by_label[0, 1],
    );

    defined $warn and print "${label}.warning $warn\n";
    defined $crit and print "${label}.critical $crit\n";
}
|
||||||
|
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
# Main program.  The first command line argument selects the mode:
#   autoconf - print "yes" when at least one namespace was found,
#              otherwise "no (<reason>)"
#   config   - print the definitions of the four graphs
#   anything else, or no argument - print the current values
my $mode = ($ARGV[0] or "print");

my $problem = autoconf_problem();
my $list = nvme_list();

if ($mode eq 'autoconf') {
    if (keys %{$list}) {
        print "yes\n";
    } else {
        printf("no (%s)\n", $problem || "no devices to monitor");
    }
    exit 0;
}

# Field names ("SN_<serial>") in a stable order.
my @sn = sort keys %{$list};

if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);
    # The nvme_bytes graph has two fields per device, so it needs its
    # own order list.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The graph headers use non-interpolating heredocs so that text
    # such as ${graph_period} reaches Munin literally; graph_order is
    # therefore printed separately, where the variable is expanded.
    print <<'EOF';
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    print "graph_order $sn_list\n" if @sn;
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device used
$sn.type GAUGE
$sn.max 100
$sn.min 0
EOF
        my_print_thresholds($sn, 'nvme_usage', $device, '95', '98');
    }

    print <<'EOF';
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_vlabel bytes read (-) / written (+) per ${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    print "graph_order $rw_list\n" if @sn;
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        # The counters are stored as reported (data units) and scaled
        # to bytes at graph time; one unit is taken as 512_000 bytes,
        # the same factor the write cycle calculation uses.
        print <<"EOF";
${sn}_r.label $device
${sn}_r.type DERIVE
${sn}_r.min 0
${sn}_r.graph no
${sn}_r.cdef ${sn}_r,512000,*
${sn}_w.label $device
${sn}_w.type DERIVE
${sn}_w.min 0
${sn}_w.negative ${sn}_r
${sn}_w.cdef ${sn}_w,512000,*
EOF
    }

    print <<'EOF';
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    print "graph_order $sn_list\n" if @sn;
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device write cycles
$sn.type GAUGE
$sn.min 0
EOF
        # No default levels for this graph.
        my_print_thresholds($sn, 'nvme_writecycles', $device);
    }

    print <<'EOF';
multigraph nvme_spare
graph_title Available spare blocks
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
    print "graph_order $sn_list\n" if @sn;
    for my $sn (@sn) {
        my $device = $list->{$sn}->{device};
        print <<"EOF";
$sn.label $device spare capacity
$sn.type GAUGE
$sn.min 0
$sn.max 100
EOF
        my_print_thresholds($sn, 'nvme_spare', $device, '10:', '3:');
    }
} else {
    # Read the SMART log of every device once, up front.
    for my $sn (@sn) {
        $list->{$sn}->{smart} = smart_log($list->{$sn}->{device});
    }

    # A value that cannot be determined is reported as "U" (unknown)
    # instead of an empty string or a division by zero.

    print "multigraph nvme_usage\n";
    for my $sn (@sn) {
        my $info = $list->{$sn};
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$sn.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $sn (@sn) {
        my $smart  = $list->{$sn}->{smart};
        my $rbytes = $smart->{data_units_read};
        my $wbytes = $smart->{data_units_written};
        $rbytes = 'U' unless defined $rbytes && length $rbytes;
        $wbytes = 'U' unless defined $wbytes && length $wbytes;
        print "${sn}_r.value $rbytes\n";
        print "${sn}_w.value $wbytes\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $sn (@sn) {
        my $info    = $list->{$sn};
        my $written = $info->{smart}->{data_units_written};

        # The unit size reported is 1000 blocks.
        my $cycles = (defined $written && length $written && $info->{capacity})
            ? $written * 512_000 / $info->{capacity}
            : 'U';
        print "$sn.value $cycles\n";
    }

    print "multigraph nvme_spare\n";
    for my $sn (@sn) {
        my $spare = $list->{$sn}->{smart}->{available_spare};

        if (defined $spare && length $spare) {
            # The value is reported with a trailing percent sign.
            $spare =~ s/%//;
        } else {
            $spare = 'U';
        }
        print "$sn.value $spare\n";
    }
}
|
@@ -15,6 +15,8 @@
|
|||||||
owner: root
|
owner: root
|
||||||
group: root
|
group: root
|
||||||
mode: '0640'
|
mode: '0640'
|
||||||
|
notify:
|
||||||
|
- Restart munin-node
|
||||||
|
|
||||||
- name: Put garage scripts
|
- name: Put garage scripts
|
||||||
ansible.builtin.copy:
|
ansible.builtin.copy:
|
||||||
|
@@ -1,5 +1,43 @@
|
|||||||
---
|
---
|
||||||
|
|
||||||
|
- name: Install prereq
|
||||||
|
ansible.builtin.package:
|
||||||
|
name: "{{ item }}"
|
||||||
|
state: present
|
||||||
|
loop:
|
||||||
|
- nvme-cli
|
||||||
|
- lm-sensors
|
||||||
|
|
||||||
|
- name: Put nvme plugin configuration
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: nvme.j2
|
||||||
|
dest: /etc/munin/plugin-conf.d/nvme
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: '0640'
|
||||||
|
notify:
|
||||||
|
- Restart munin-node
|
||||||
|
|
||||||
|
- name: Put nvme script
|
||||||
|
ansible.builtin.copy:
|
||||||
|
src: files/nvme
|
||||||
|
dest: /etc/munin/plugins/nvme
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: '0755'
|
||||||
|
notify:
|
||||||
|
- Restart munin-node
|
||||||
|
|
||||||
|
- name: Configure specific munin plugin
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "/etc/munin/plugins/sensors_{{ item }}"
|
||||||
|
src: /usr/share/munin/plugins/sensors_
|
||||||
|
state: link
|
||||||
|
notify:
|
||||||
|
- Restart munin-node
|
||||||
|
loop:
|
||||||
|
- temp
|
||||||
|
|
||||||
- name: Delete squid plugins
|
- name: Delete squid plugins
|
||||||
ansible.builtin.shell:
|
ansible.builtin.shell:
|
||||||
cmd: "rm -f /etc/munin/plugins/squid_*"
|
cmd: "rm -f /etc/munin/plugins/squid_*"
|
||||||
|
@@ -87,11 +87,6 @@
|
|||||||
- charge
|
- charge
|
||||||
- voltages
|
- voltages
|
||||||
|
|
||||||
# for physical servers
|
|
||||||
- name: Execute specific tasks for physical servers
|
|
||||||
ansible.builtin.include_tasks: physical_servers.yml
|
|
||||||
when: "'hypervisors' in group_names"
|
|
||||||
|
|
||||||
- name: Reconfigure munin-node
|
- name: Reconfigure munin-node
|
||||||
ansible.builtin.shell:
|
ansible.builtin.shell:
|
||||||
cmd: munin-node-configure --shell | sh # noqa: risky-shell-pipe
|
cmd: munin-node-configure --shell | sh # noqa: risky-shell-pipe
|
||||||
|
@@ -1,16 +0,0 @@
|
|||||||
---
|
|
||||||
# for physical servers
|
|
||||||
- name: Install necessary packages for hypervisors
|
|
||||||
ansible.builtin.package:
|
|
||||||
name: lm-sensors
|
|
||||||
state: present
|
|
||||||
|
|
||||||
- name: Configure specific munin plugin
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "/etc/munin/plugins/sensors_{{ item }}"
|
|
||||||
src: /usr/share/munin/plugins/sensors_
|
|
||||||
state: link
|
|
||||||
notify:
|
|
||||||
- Restart munin-node
|
|
||||||
loop:
|
|
||||||
- temp
|
|
2
roles/munin_client/templates/nvme.j2
Normal file
2
roles/munin_client/templates/nvme.j2
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[nvme]
|
||||||
|
user root
|
Reference in New Issue
Block a user