A straightforward monitoring script for Ubuntu-based systems that logs a range of critical system metrics.
Functionalities:
- Temperature and System Utilization Monitoring:
— Logs CPU and GPU temperatures.
— Tracks CPU utilization, RAM usage, and per-core CPU utilization.
— Logs metrics every 60 seconds. - Network Usage Monitoring:
— Logs network usage statistics using the ifstat command.
— Records data every 60 seconds. - Docker Container Monitoring:
— Logs CPU and memory usage of all running Docker containers.
— Records data every 60 seconds. - File and Directory Change Monitoring:
— Monitors file and directory changes in a specified path.
— Logs changes (created, modified, deleted) once every 10 minutes. - User Login Monitoring:
— Logs the current users logged into the system using the who command.
— Records data every minute. - System Logs Parsing:
— Parses /var/log/syslog for error patterns.
— Logs any matches found every 30 seconds. - Disk Usage Monitoring:
— Logs disk usage statistics for each partition.
— Includes percentage used, used GB, remaining GB, and total GB.
— Issues warnings if disk usage exceeds 90%.
— Logs every 5 minutes. - Kernel Error Monitoring:
— Parses /var/log/kern.log for kernel errors.
— Logs any matches found every minute. - Reboot Times Logging:
— Logs the times of the last system reboots.
— Records data every hour. - Cache Memory Monitoring:
— Logs the amount of cache memory used by the system.
— Records data every 5 minutes. - SSD Health Monitoring with smartctl:
— Automatically detects the SSD device.
— Runs smartctl -a on the detected SSD to monitor its health.
— Logs the output every 30 seconds. - Generate NVIDIA Bug Reports
— Runs nvidia bug report script to obtain the tar file for further debugging. - Integrate OpenVAS to run periodic vuln scans
— OpenVAS integration to generate reports
Dependencies:
sudo apt install smartmontools ifstat -y
sudo apt install python3-pip
pip3 install psutil pytz watchdog
import subprocess
import time
import datetime
import psutil
import logging
from logging.handlers import RotatingFileHandler
import pytz
import threading
import re
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
# Log config
LOG_MAX_SIZE = 10 * 1024 * 1024 # 10 MB for general monitoring logs
LOG_BACKUP_COUNT = 10  # rotated backup files kept per log (see RotatingFileHandler)
# Set the timezone to UTC
utc = pytz.timezone('UTC')  # shared timezone object used for all log timestamps
def generate_nvidia_bug_report(logger):
    """Periodically run nvidia-bug-report.sh and log the outcome.

    Bug fixed: the original placed ``time.sleep(30)`` *after* the ``while``
    loop (unreachable until an exception was raised), so the report tool ran
    back-to-back with no delay. The try/except now sits inside the loop so a
    single failure does not kill the thread, and the sleep runs every pass.

    Args:
        logger: logger that receives success/failure messages.
    """
    while True:
        try:
            # nvidia-bug-report.sh writes a tar.gz in the CWD for later debugging.
            result = subprocess.run(['nvidia-bug-report.sh'], capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("NVIDIA bug report generated successfully.")
                # Save or process the bug report output as needed
            else:
                logger.error(f"Failed to generate NVIDIA bug report: {result.stderr}")
        except Exception as e:
            logger.error(f"Exception while generating NVIDIA bug report: {str(e)}")
        time.sleep(30)
# Setup logging function
def setup_logging(log_filename):
    """Create (or fetch) a logger writing to a size-rotated file.

    The filename doubles as the logger name, so repeated calls with the same
    filename return the same logger object. Bug fixed: the original appended
    a new RotatingFileHandler on *every* call, and the ``__main__`` block
    calls this twice per file — every log line was emitted twice. The handler
    is now attached only once.

    Args:
        log_filename: path of the log file; also used as the logger name.

    Returns:
        A ``logging.Logger`` at INFO level with one rotating file handler.
    """
    logger = logging.getLogger(log_filename)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # guard against duplicate handlers on repeated setup
        handler = RotatingFileHandler(log_filename, maxBytes=LOG_MAX_SIZE, backupCount=LOG_BACKUP_COUNT)
        formatter = logging.Formatter('%(asctime)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger
# Auto-detect SSD and run smartctl
def detect_ssd_and_log_smartctl(smartctl_logger):
    """Every 30 seconds, locate the first nvme*/sd* mounted device and log
    the full ``smartctl -a`` report for it.

    Args:
        smartctl_logger: logger receiving the SMART output or error messages.
    """
    while True:
        # Pick the first mounted partition whose device name looks like a disk/SSD.
        ssd_path = None
        for part in psutil.disk_partitions(all=False):
            if 'nvme' in part.device or 'sd' in part.device:
                ssd_path = part.device.replace('/dev/', '')
                break
        if ssd_path is None:
            smartctl_logger.warning("No SSD detected.")
            print("No SSD detected.")
        else:
            try:
                timestamp = datetime.datetime.now(utc).strftime("%Y-%m-%d %H:%M:%S UTC")
                # Full SMART attribute dump for the detected device.
                result = subprocess.run(['smartctl', '-a', f'/dev/{ssd_path}'], capture_output=True, text=True)
                if result.returncode != 0:
                    error_message = f"Error running smartctl for {ssd_path}: {result.stderr}"
                    smartctl_logger.error(error_message)
                    print(error_message)
                else:
                    smartctl_logger.info(f"Timestamp: {timestamp}\n{result.stdout}")
                    print(f"Logged smartctl data for {ssd_path} at {timestamp}")
            except Exception as e:
                error_message = f"Exception occurred while running smartctl for {ssd_path}: {e}"
                smartctl_logger.error(error_message)
                print(error_message)
        time.sleep(30)
# Log temperatures and system utilization
def log_temperatures(temp_logger):
    """Each pass (~62 s: two blocking 1 s cpu_percent samples + 60 s sleep),
    log CPU/GPU temperatures plus CPU, RAM and per-core utilization.

    Requires lm-sensors (``sensors``) and ``nvidia-smi`` on the PATH.
    Fix: the naive, deprecated ``datetime.utcnow()`` is replaced with the
    timezone-aware ``datetime.now(utc)`` used by every other task in this
    file, keeping timestamps consistent.

    Args:
        temp_logger: logger receiving the combined metrics line.
    """
    while True:
        try:
            cpu_temp_output = subprocess.check_output(["sensors"], text=True)
            # Modify the command according to your nvidia GPU
            gpu_temp_output = subprocess.check_output(["nvidia-smi", "--query-gpu=temperature.gpu", "--format=csv,noheader,nounits"], text=True)
            # Get CPU utilization (blocks for the sampling interval)
            cpu_utilization = psutil.cpu_percent(interval=1)
            # Get RAM utilization
            ram_utilization = psutil.virtual_memory().percent
            # Get CPU core utilization (one percentage per logical core)
            cpu_cores_utilization = psutil.cpu_percent(interval=1, percpu=True)
            timestamp = datetime.datetime.now(utc).strftime("%Y-%m-%d %H:%M:%S UTC")
            temp_logger.info(f"Timestamp UTC: {timestamp}")
            temp_logger.info(f"CPU Temp: {cpu_temp_output.strip()}, GPU Temp: {gpu_temp_output.strip()}, CPU Utilization: {cpu_utilization}%, RAM Utilization: {ram_utilization}%, CPU Core Utilization: {cpu_cores_utilization}")
            print(f"Temperature and system metrics logged at {timestamp}")
        except subprocess.CalledProcessError as e:
            print(f"Error: {e}")
            temp_logger.error(f"Error: {e}")
        time.sleep(60)
# Log network monitoring data
def log_network_usage(net_logger):
    """Log interface throughput from one ``ifstat 30 1`` sample, then sleep 60 s.

    Fixes: the original comment claimed a 1-second sample while the command
    actually averages over 30 seconds, and the output lines were run through
    a no-op ``[0:]`` slice, now removed.

    Args:
        net_logger: logger receiving one timestamped line per ifstat row.
    """
    while True:
        try:
            timestamp = datetime.datetime.now(utc).strftime("%Y-%m-%d %H:%M:%S UTC")
            log_message = f"Timestamp: {timestamp}"
            print(log_message)
            net_logger.info(log_message)
            # One ifstat sample averaged over 30 seconds (interval=30, count=1)
            result = subprocess.run(['ifstat', '30', '1'], capture_output=True, text=True)
            if result.returncode == 0:
                # Log every line of the ifstat output, headers included.
                for line in result.stdout.splitlines():
                    log_message = f"[{timestamp}] {line}"
                    print(log_message)
                    net_logger.info(log_message)
            else:
                error_message = f"Error: {result.stderr}"
                print(error_message)
                net_logger.error(error_message)
        except Exception as e:
            exception_message = f"Exception occurred: {e}"
            print(exception_message)
            net_logger.error(exception_message)
        time.sleep(60)
# Log Docker container stats
def log_docker_stats(docker_logger):
    """Every ~60 seconds, log one CPU/memory snapshot per running container.

    Fix: the ``--format`` template was wrapped in literal double quotes — a
    shell-quoting habit that is wrong for an exec-style argv list. Docker
    received the quotes verbatim, so every logged line carried stray ``"``
    characters. The quotes are removed.

    Args:
        docker_logger: logger receiving one timestamped line per container.
    """
    while True:
        try:
            # Get the current timestamp in UTC
            timestamp = datetime.datetime.now(utc).strftime("%Y-%m-%d %H:%M:%S UTC")
            # One snapshot (--no-stream) of all running containers.
            result = subprocess.run(
                ['docker', 'stats', '--no-stream', '--format',
                 '{{.Name}}: CPU={{.CPUPerc}} MEM={{.MemUsage}}'],
                capture_output=True, text=True)
            if result.returncode == 0:
                # Process and log the docker stats output
                for line in result.stdout.splitlines():
                    log_message = f"[{timestamp}] {line}"
                    print(log_message)
                    docker_logger.info(log_message)
            else:
                error_message = f"Error: {result.stderr}"
                print(error_message)
                docker_logger.error(error_message)
        except Exception as e:
            exception_message = f"Exception occurred: {e}"
            print(exception_message)
            docker_logger.error(exception_message)
        time.sleep(60)
# Monitor file/directory changes
class ChangeHandler(FileSystemEventHandler):
    """Watchdog event handler that writes each create/modify/delete event to a logger."""
    def __init__(self, file_logger):
        # Logger to which all filesystem change events are written.
        self.file_logger = file_logger
    def on_modified(self, event):
        # Fires for files and directories alike; event.src_path is the changed path.
        self.file_logger.info(f"File modified: {event.src_path}")
    def on_created(self, event):
        self.file_logger.info(f"File created: {event.src_path}")
    def on_deleted(self, event):
        self.file_logger.info(f"File deleted: {event.src_path}")
def monitor_file_changes(path, file_logger):
    """Continuously watch *path* (recursively) and log every filesystem change.

    Bug fixed: the original created, started, and stopped a brand-new
    Observer on every loop pass, watching for only 1 second out of every
    ~61 — so nearly all filesystem events were silently missed. A single
    long-lived observer now watches continuously; events are delivered on
    the observer's own threads and the loop merely keeps this thread alive.

    Args:
        path: directory to watch recursively.
        file_logger: logger that receives one line per change event.
    """
    event_handler = ChangeHandler(file_logger)
    observer = Observer()
    observer.schedule(event_handler, path, recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(60)  # idle; the observer threads do the actual work
    finally:
        # Reached only if the thread is interrupted; shut the observer down cleanly.
        observer.stop()
        observer.join()
# Log users and who are logged in
def log_logged_in_users(user_logger):
    """Every 60 seconds, record the output of ``who`` (current login sessions).

    Args:
        user_logger: logger receiving the timestamped session listing.
    """
    while True:
        try:
            now_utc = datetime.datetime.now(utc).strftime("%Y-%m-%d %H:%M:%S UTC")
            sessions = subprocess.check_output(['who'], text=True)
            user_logger.info(f"Timestamp: {now_utc}\nUsers logged in:\n{sessions}")
            print(f"Logged in users logged at {now_utc}")
        except subprocess.CalledProcessError as exc:
            user_logger.error(f"Error: {exc}")
        time.sleep(60)
# Parse system logs for errors
def parse_system_logs(log_file, logger):
    """Scan *log_file* once for error-like lines, log each match, then sleep 30 s.

    NOTE(review): every call re-reads the file from the beginning, so matches
    already reported by a previous call are logged again — confirm whether
    duplicates are acceptable or a persisted offset should be kept.
    NOTE(review): the trailing sleep is load-bearing — log_system_logs loops
    over this function with no delay of its own, so removing the sleep here
    would turn that caller into a busy loop.

    Args:
        log_file: path of the log file to scan.
        logger: logger that receives one line per matched entry.
    """
    # Case-insensitive substring patterns, compiled once per call.
    error_patterns = [r'error', r'fail', r'critical'] # Add more patterns as needed
    compiled_patterns = [re.compile(pat, re.IGNORECASE) for pat in error_patterns]
    with open(log_file, 'r') as f:
        for line in f:
            if any(pat.search(line) for pat in compiled_patterns):
                logger.info(f"Log match found: {line.strip()}")
    time.sleep(30)
def log_system_logs(log_logger):
    """Repeatedly scan /var/log/syslog for error patterns.

    The pacing comes from parse_system_logs, which sleeps 30 s after each
    successful scan. Bug fixed: the except branch previously had no delay,
    so a persistent failure (e.g. a missing or unreadable syslog) spun this
    thread flat-out and flooded the error log; it now backs off before retrying.

    Args:
        log_logger: logger receiving matches and scan errors.
    """
    while True:
        try:
            parse_system_logs('/var/log/syslog', log_logger)
        except Exception as e:
            log_logger.error(f"Error parsing system logs: {e}")
            time.sleep(30)  # avoid a busy error loop when syslog is unreadable
# Log disk usage
def monitor_disk_usage(disk_logger, threshold=90):
    """Every 5 minutes, log usage for each mounted partition; warn above threshold.

    Robustness fix: ``psutil.disk_usage`` can raise PermissionError/OSError
    for restricted mountpoints (e.g. snap/squashfs mounts); the original let
    one bad mountpoint kill the entire monitoring thread. Each partition is
    now handled independently.

    Args:
        disk_logger: logger receiving one line per partition.
        threshold: usage percentage at or above which a warning is also emitted.
    """
    while True:
        for part in psutil.disk_partitions():
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except (PermissionError, OSError) as e:
                disk_logger.error(f"Could not read disk usage for {part.mountpoint}: {e}")
                continue
            total_gb = usage.total / (1024 ** 3)
            free_gb = usage.free / (1024 ** 3)
            used_gb = total_gb - free_gb
            disk_logger.info(f"Disk usage on {part.mountpoint}: {usage.percent}% used, {used_gb:.2f} GB used, {free_gb:.2f} GB remaining out of {total_gb:.2f} GB total")
            if usage.percent >= threshold:
                disk_logger.warning(f"Disk usage alert on {part.mountpoint}: {usage.percent}% used, {used_gb:.2f} GB used, {free_gb:.2f} GB remaining out of {total_gb:.2f} GB total")
        time.sleep(300)
# Log kernel errors from kern.log
def log_kernel_errors(kernel_logger):
    """Scan /var/log/kern.log for error patterns in a loop.

    parse_system_logs sleeps 30 s internally and this loop adds 60 s more,
    so scans run roughly every 90 seconds.

    Args:
        kernel_logger: logger receiving matches and scan errors.
    """
    while True:
        try:
            parse_system_logs('/var/log/kern.log', kernel_logger)
        except Exception as err:
            kernel_logger.error(f"Error parsing kernel logs: {err}")
        time.sleep(60)
# Log last reboot times
def log_last_reboot_times(reboot_logger):
    """Once per hour, record the output of ``last reboot``.

    Args:
        reboot_logger: logger receiving the reboot history dump.
    """
    while True:
        try:
            history = subprocess.check_output(['last', 'reboot'], text=True)
            reboot_logger.info(f"Last reboots:\n{history}")
            print("Reboot times logged")
        except subprocess.CalledProcessError as exc:
            reboot_logger.error(f"Error: {exc}")
        time.sleep(3600)
# Log cache memory status
def log_cache_memory_status(cache_logger):
    """Every 5 minutes, log the cache memory size reported by psutil.

    NOTE: ``virtual_memory().cached`` exists on Linux; on other platforms
    the attribute access raises and is caught below.

    Args:
        cache_logger: logger receiving the cache size in MB.
    """
    while True:
        try:
            cached_mb = psutil.virtual_memory().cached / 1024 / 1024
            cache_logger.info(f"Cache memory: {cached_mb} MB")
        except Exception as exc:
            cache_logger.error(f"Error fetching cache memory status: {exc}")
        time.sleep(300)
if __name__ == "__main__":
    # One (log file, interval, worker, description) row per monitoring task.
    # NOTE: the interval column is informational only — every worker
    # hard-codes its own sleep internally.
    # Fix: the original built all twelve loggers once up front and then
    # rebuilt each one inside the loop below; since setup_logging attaches a
    # handler per call, every line was logged twice. Loggers are now created
    # exactly once, inside the loop.
    tasks = [
        ("docker.log", 10, log_docker_stats, 'Detailed Docker Monitoring'),
        ("temperature.log", 10, log_temperatures, 'Temperature Monitoring'),
        ("network.log", 10, log_network_usage, 'Network Monitoring'),
        ("file_changes.log", 10, monitor_file_changes, 'File Change Monitoring'),
        ("user_logins.log", 10, log_logged_in_users, 'User Login Monitoring'),
        ("system_logs.log", 10, log_system_logs, 'System Log Parsing'),
        ("disk_usage.log", 10, monitor_disk_usage, 'Disk Usage Monitoring'),
        ("kernel_errors.log", 10, log_kernel_errors, 'Kernel Error Monitoring'),
        ("reboot_times.log", 10, log_last_reboot_times, 'Reboot Times Logging'),
        ("cache_memory.log", 10, log_cache_memory_status, 'Cache Memory Status'),
        ("smartctl.log", 10, detect_ssd_and_log_smartctl, 'SSD SMART Monitoring'),
        ("nvidia_bug.log", 10, generate_nvidia_bug_report, 'Generate nvidia bug report')
    ]
    threads = []
    for log_filename, interval, function, description in tasks:
        logger = setup_logging(log_filename)
        # The file-change monitor additionally needs the directory to watch.
        if description == 'File Change Monitoring':
            thread = threading.Thread(target=function, args=('.', logger))
        else:
            thread = threading.Thread(target=function, args=(logger,))
        threads.append(thread)
    for thread in threads:
        thread.start()
    # All workers loop forever, so join() blocks indefinitely (Ctrl+C to stop).
    for thread in threads:
        thread.join()
Possible extension: expose these metrics through a small custom API so the data can be collected from the system and monitored remotely.