feat: execution failure reports, auto-printer for WMT, UTC timezone fix for all timestamps

This commit is contained in:
ske087
2026-04-24 15:52:12 +03:00
parent d2485e4c66
commit 056f467791
27 changed files with 1391 additions and 285 deletions

View File

@@ -23,6 +23,7 @@ class AnsibleService:
SETTINGS_FILE = Path("data/ansible_settings.json")
DEFAULT_SETTINGS = {
"ssh_fallback_password": "raspberry",
"use_password_auth": False,
}
def __init__(self):
@@ -30,13 +31,16 @@ class AnsibleService:
self.ansible_dir = Path("ansible")
self.inventory_file = self.ansible_dir / "inventory" / "dynamic_inventory.yaml"
self.playbook_dir = self.ansible_dir / "playbooks"
self.ssh_key_path = Path.home() / ".ssh" / "ansible_key"
self.ssh_keys_dir = self.ansible_dir / "ssh_keys"
self.ssh_key_path = self.ssh_keys_dir / "app_key"
self.ansible_cfg_path = self.ansible_dir / "ansible.cfg"
# Ensure directories exist
self.ansible_dir.mkdir(exist_ok=True)
(self.ansible_dir / "inventory").mkdir(exist_ok=True)
(self.ansible_dir / "playbooks").mkdir(exist_ok=True)
(self.ansible_dir / "roles").mkdir(exist_ok=True)
self.ssh_keys_dir.mkdir(mode=0o700, exist_ok=True)
# ------------------------------------------------------------------ #
# Settings helpers #
@@ -136,12 +140,24 @@ class AnsibleService:
'ansible_host': '127.0.0.1'
}
else:
hvars = {
'ansible_host': device.device_ip,
'ansible_user': 'pi',
'ansible_ssh_private_key_file': str(self.ssh_key_path),
'ansible_ssh_common_args': '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
}
settings = self.load_settings()
use_password = settings.get('use_password_auth', False)
ssh_password = settings.get('ssh_fallback_password', '')
if use_password and ssh_password:
hvars = {
'ansible_host': device.device_ip,
'ansible_user': 'pi',
'ansible_password': ssh_password,
'ansible_become_password': ssh_password,
'ansible_ssh_common_args': '-o PubkeyAuthentication=no -o PreferredAuthentications=password -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
}
else:
hvars = {
'ansible_host': device.device_ip,
'ansible_user': 'pi',
'ansible_ssh_private_key_file': str(self.ssh_key_path.resolve()),
'ansible_ssh_common_args': '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
}
children['monitoring_devices']['hosts'][device.hostname] = hvars
synced += 1
self._write_inventory(data)
@@ -249,7 +265,7 @@ class AnsibleService:
'name': 'Update monitoring devices',
'hosts': 'all',
'become': True,
'gather_facts': True,
'gather_facts': False,
'tasks': [
{
'name': 'Update apt cache',
@@ -268,40 +284,24 @@ class AnsibleService:
'register': 'upgrade_result'
},
{
'name': 'Restart device if required',
'reboot': {
'reboot_timeout': 600
},
'when': 'upgrade_result.changed'
},
{
'name': 'Check service status',
'systemd': {
'name': 'prezenta.service',
'state': 'started'
'name': 'Show upgrade result',
'debug': {
'msg': '{{ upgrade_result.stdout_lines }}'
}
},
{
'name': 'Report update completion',
'uri': {
'url': 'http://{{ ansible_controller_ip }}/api/update_complete',
'method': 'POST',
'body_format': 'json',
'body': {
'hostname': '{{ inventory_hostname }}',
'device_ip': '{{ ansible_host }}',
'status': 'completed',
'packages_updated': '{{ upgrade_result.stdout_lines | length }}'
}
'name': 'Clean up apt cache',
'apt': {
'autoclean': True
}
}
]
}
playbook_path = self.playbook_dir / "update_devices.yml"
with open(playbook_path, 'w') as f:
yaml.dump([playbook_content], f, default_flow_style=False)
return str(playbook_path)
def create_restart_service_playbook(self) -> str:
@@ -390,6 +390,17 @@ class AnsibleService:
# Add extra variables
if extra_vars:
cmd.extend(['--extra-vars', json.dumps(extra_vars)])
# Inject password auth vars if enabled (overrides per-host inventory vars)
settings = self.load_settings()
if settings.get('use_password_auth') and settings.get('ssh_fallback_password'):
pwd = settings['ssh_fallback_password']
cmd.extend(['--extra-vars', json.dumps({
'ansible_password': pwd,
'ansible_become_password': pwd,
'ansible_ssh_private_key_file': '',
'ansible_ssh_common_args': '-o PubkeyAuthentication=no -o PreferredAuthentications=password -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
})])
# Create enhanced execution record using new model
execution_id = str(uuid.uuid4())
@@ -416,12 +427,19 @@ class AnsibleService:
with tempfile.NamedTemporaryFile(mode='w+', suffix='.log', delete=False) as log_file:
log_file_path = log_file.name
env = os.environ.copy()
env['PYTHONUNBUFFERED'] = '1'
env['ANSIBLE_FORCE_COLOR'] = '0'
env['ANSIBLE_NOCOLOR'] = '1'
env['ANSIBLE_CONFIG'] = str(self.ansible_cfg_path.resolve())
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
cwd=str(self.ansible_dir)
cwd=str(self.ansible_dir),
env=env,
)
stdout, stderr = process.communicate()
@@ -435,11 +453,12 @@ class AnsibleService:
execution.stderr_log = stderr
execution.ansible_log_file = log_file_path
# Always parse recap stats regardless of exit code —
# Ansible exits non-zero when any host fails/is unreachable.
self._parse_ansible_results_enhanced(execution, stdout)
if process.returncode == 0:
execution.status = 'completed'
execution.summary_message = 'Playbook executed successfully'
# Parse stdout for success/failure counts
self._parse_ansible_results_enhanced(execution, stdout)
else:
execution.status = 'failed'
execution.summary_message = f'Playbook failed with exit code {process.returncode}'
@@ -474,10 +493,14 @@ class AnsibleService:
def execute_playbook_async(self, playbook_name: str, limit_hosts: List[str] = None,
extra_vars: Dict = None, priority: int = 5,
max_retries: int = 0) -> Dict:
max_retries: int = 0,
force_password_auth: bool = False) -> Dict:
"""
Start a playbook in a background thread.
Returns immediately with the execution_id so the caller can poll /live.
force_password_auth=True overrides the use_password_auth setting and always
injects password vars — used by distribute_ssh_keys which must run before
keys are deployed.
"""
try:
self.generate_dynamic_inventory()
@@ -498,6 +521,17 @@ class AnsibleService:
# Pass all extra vars as a single JSON string to avoid value-quoting issues
cmd.extend(['--extra-vars', json.dumps(extra_vars)])
# Inject password auth vars if enabled OR forced
settings = self.load_settings()
if (force_password_auth or settings.get('use_password_auth')) and settings.get('ssh_fallback_password'):
pwd = settings['ssh_fallback_password']
cmd.extend(['--extra-vars', json.dumps({
'ansible_password': pwd,
'ansible_become_password': pwd,
'ansible_ssh_private_key_file': '',
'ansible_ssh_common_args': '-o PubkeyAuthentication=no -o PreferredAuthentications=password -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
})])
# Create a persistent log file (NOT deleted on close)
log_fd, log_file_path = tempfile.mkstemp(suffix='.log', prefix='ansible_')
os.close(log_fd)
@@ -546,6 +580,7 @@ class AnsibleService:
env['PYTHONUNBUFFERED'] = '1'
env['ANSIBLE_FORCE_COLOR'] = '0'
env['ANSIBLE_NOCOLOR'] = '1'
env['ANSIBLE_CONFIG'] = str(self.ansible_cfg_path.resolve())
process = subprocess.Popen(
cmd,
@@ -586,10 +621,12 @@ class AnsibleService:
execution.completed_at = datetime.utcnow()
execution.exit_code = process.returncode
execution.stdout_log = full_output
# Always parse recap stats regardless of exit code —
# Ansible exits non-zero when any host fails/is unreachable.
self._parse_ansible_results_enhanced(execution, full_output)
if process.returncode == 0:
execution.status = 'completed'
execution.summary_message = 'Playbook executed successfully'
self._parse_ansible_results_enhanced(execution, full_output)
else:
execution.status = 'failed'
execution.summary_message = f'Playbook failed (exit {process.returncode})'
@@ -653,62 +690,62 @@ class AnsibleService:
return {'success': False, 'error': str(e)}
def _parse_ansible_results_enhanced(self, execution: PlaybookExecution, output: str):
"""Parse Ansible output for enhanced result statistics"""
lines = output.split('\n')
"""Parse Ansible PLAY RECAP output for result statistics."""
import re
successful_hosts = 0
failed_hosts = 0
unreachable_hosts = 0
skipped_hosts = 0
changed_hosts = 0
for line in lines:
if 'ok=' in line and 'changed=' in line:
# Parse line like: "host1: ok=4 changed=2 unreachable=0 failed=0"
try:
if 'failed=0' in line:
successful_hosts += 1
else:
failed_count = int(line.split('failed=')[1].split()[0])
if failed_count > 0:
failed_hosts += 1
else:
successful_hosts += 1
if 'unreachable=' in line:
unreachable = int(line.split('unreachable=')[1].split()[0])
if unreachable > 0:
unreachable_hosts += 1
if 'skipped=' in line:
skipped = int(line.split('skipped=')[1].split()[0])
if skipped > 0:
skipped_hosts += 1
if 'changed=' in line:
changed = int(line.split('changed=')[1].split()[0])
if changed > 0:
changed_hosts += 1
except (ValueError, IndexError):
# Skip malformed lines
continue
# Update execution record
execution.successful_hosts = successful_hosts
execution.failed_hosts = failed_hosts
# Match PLAY RECAP lines:
# "RPI-FOO : ok=4 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0"
recap_re = re.compile(
r'ok=(\d+)\s+changed=(\d+)\s+unreachable=(\d+)\s+failed=(\d+)'
)
for line in output.split('\n'):
m = recap_re.search(line)
if not m:
continue
ok = int(m.group(1))
changed = int(m.group(2))
unreachable = int(m.group(3))
failed = int(m.group(4))
if unreachable > 0:
unreachable_hosts += 1
elif failed > 0:
failed_hosts += 1
else:
successful_hosts += 1
if changed > 0:
changed_hosts += 1
execution.successful_hosts = successful_hosts
execution.failed_hosts = failed_hosts
execution.unreachable_hosts = unreachable_hosts
execution.skipped_hosts = skipped_hosts
execution.changed_hosts = changed_hosts
execution.skipped_hosts = skipped_hosts
execution.changed_hosts = changed_hosts
def _get_playbook_description(self, playbook_name: str) -> str:
"""Get user-friendly description for playbook"""
descriptions = {
'update_devices': 'Update all packages and monitoring software on devices',
'restart_service': 'Restart monitoring services on selected devices',
'restart_service': 'Restart monitoring services on selected devices',
'system_health': 'Check system health and monitoring status',
'maintenance_mode': 'Put devices in maintenance mode'
'maintenance_mode': 'Put devices in maintenance mode',
'distribute_ssh_keys': 'Push server public key to all devices using password auth',
}
return descriptions.get(playbook_name, f'Execute {playbook_name} playbook')
def create_distribute_ssh_keys_playbook(self) -> str:
    """Return the path of the distribute_ssh_keys playbook.

    Nothing is written here: the method only checks whether
    ``distribute_ssh_keys.yml`` is present under ``self.playbook_dir`` and
    logs a warning when it is not. The path is returned as a string in
    both cases.
    """
    target = self.playbook_dir / 'distribute_ssh_keys.yml'
    if target.exists():
        return str(target)
    # Missing file is reported but not treated as an error here.
    logging.warning('distribute_ssh_keys.yml not found — playbook file is missing')
    return str(target)
def create_system_health_playbook(self) -> str:
"""Create system health check playbook"""
@@ -782,6 +819,53 @@ class AnsibleService:
unreachable = int(line.split('unreachable=')[1].split()[0])
execution.unreachable_hosts += unreachable
def test_password_auth(self, device_ip: str, password: str,
                       username: str = 'pi', port: int = 22) -> Dict:
    """
    Test SSH connectivity using password-only authentication (no key fallback).

    A short TCP connect to ``device_ip:port`` is attempted first; if that
    fails the result has ``reachable=False``. Otherwise ``ssh`` is run
    through ``sshpass`` with public-key authentication disabled and a
    trivial ``echo OK`` remote command.

    The password is handed to sshpass through the ``SSHPASS`` environment
    variable (``sshpass -e``) instead of ``-p`` so it does not appear in
    the command line visible to other local users via the process list.

    Args:
        device_ip: Address of the device to test.
        password: Password to try for ``username``.
        username: Remote login name.
        port: SSH port on the device.

    Returns:
        A dict with ``success`` and ``reachable`` flags, plus ``message``
        on success or ``error`` describing the failure.
    """
    try:
        # Quick TCP reachability check first
        import socket
        with socket.create_connection((device_ip, port), timeout=5):
            pass
    except OSError as e:
        # Connection refused and timeouts are both OSError subclasses.
        return {'success': False, 'reachable': False,
                'error': f'Host unreachable on port {port}: {e}'}

    ssh_cmd = [
        'sshpass', '-e',
        'ssh',
        '-o', 'PubkeyAuthentication=no',
        '-o', 'PreferredAuthentications=password',
        '-o', 'StrictHostKeyChecking=no',
        '-o', 'UserKnownHostsFile=/dev/null',
        '-o', 'ConnectTimeout=8',
        '-p', str(port),
        f'{username}@{device_ip}',
        'echo OK',
    ]
    # Copy of the current environment plus the secret for sshpass -e.
    ssh_env = dict(os.environ, SSHPASS=password)

    try:
        result = subprocess.run(
            ssh_cmd,
            capture_output=True, text=True, timeout=15,
            env=ssh_env,
        )
        if result.returncode == 0 and 'OK' in result.stdout:
            return {'success': True, 'reachable': True,
                    'message': f'Password authentication succeeded for {username}@{device_ip}'}
        stderr = (result.stderr or '').strip()
        return {'success': False, 'reachable': True,
                'error': f'Authentication failed — {stderr or "wrong password"}'}
    except subprocess.TimeoutExpired:
        return {'success': False, 'reachable': True,
                'error': 'SSH command timed out'}
    except FileNotFoundError:
        # Raised when the sshpass executable itself cannot be found.
        return {'success': False, 'reachable': True,
                'error': 'sshpass not installed — run: sudo apt-get install sshpass'}
    except Exception as e:
        return {'success': False, 'reachable': True, 'error': str(e)}
def test_ssh_connectivity(self, device_ip: str, username: str = 'pi') -> Dict:
"""Test SSH connectivity to a device"""
try: