Service Health
Service health monitoring in NopeSight provides real-time visibility into the operational status of your business services. By aggregating component health, performance metrics, and dependency status, it delivers a comprehensive view of service health with predictive insights and automated remediation.
Health Monitoring Framework
Health Scoring System
Component Health Scoring
class ComponentHealthScorer:
    """Score one component's health by combining five per-metric scores.

    The per-metric scores (availability, performance, error rate, resource
    utilization, dependency health) are combined with the weights held in
    ``metrics_weights``.  The metric getters, ``calculate_resource_score``,
    ``apply_penalties``, ``determine_health_state``, ``calculate_trend`` and
    ``identify_health_factors`` are not defined in this snippet and must be
    provided elsewhere.
    """

    def __init__(self):
        # Relative weight of each metric in the overall score (sums to 1.0).
        self.metrics_weights = {
            'availability': 0.3,
            'performance': 0.25,
            'error_rate': 0.2,
            'resource_utilization': 0.15,
            'dependency_health': 0.1
        }

    def calculate_health_score(self, component):
        """Calculate health score for a component.

        Args:
            component: The component to score; it is handed unchanged to the
                metric getters.

        Returns:
            dict with keys ``overall_score`` (weighted total after
            penalties), ``component_scores`` (per-metric scores),
            ``health_state``, ``trend`` and ``factors``.
        """
        scores = {}

        # Availability score, capped at 100
        availability = self.get_availability_metrics(component)
        scores['availability'] = min(100, availability.uptime_percentage)

        # Performance score
        performance = self.get_performance_metrics(component)
        scores['performance'] = self.calculate_performance_score(performance)

        # Error rate score
        # NOTE(review): the * 100 suggests error_rate is a 0-1 fraction -- confirm.
        error_metrics = self.get_error_metrics(component)
        scores['error_rate'] = max(0, 100 - (error_metrics.error_rate * 100))

        # Resource utilization score
        resources = self.get_resource_metrics(component)
        scores['resource_utilization'] = self.calculate_resource_score(resources)

        # Dependency health score
        dep_health = self.get_dependency_health(component)
        scores['dependency_health'] = dep_health.average_health

        # Calculate weighted score
        total_score = sum(
            scores[metric] * weight
            for metric, weight in self.metrics_weights.items()
        )

        # Apply penalties
        total_score = self.apply_penalties(total_score, component)

        return {
            'overall_score': total_score,
            'component_scores': scores,
            'health_state': self.determine_health_state(total_score),
            'trend': self.calculate_trend(component, total_score),
            'factors': self.identify_health_factors(scores)
        }

    def calculate_performance_score(self, performance):
        """Calculate performance score based on SLA targets.

        Starts from 100 and deducts:
          * up to 50 points when the average response time exceeds the SLA
            target (100 points per 100% overshoot, capped at 50);
          * 20 points when throughput is below 80% of the expected value;
          * 15 points when p99 latency exceeds its SLA value.

        Args:
            performance: Object exposing ``avg_response_time``,
                ``sla_target``, ``throughput``, ``expected_throughput``,
                ``p99_latency`` and ``sla_p99``.

        Returns:
            The score, never below 0.
        """
        score = 100

        # Response time impact
        if performance.avg_response_time > performance.sla_target:
            if performance.sla_target > 0:
                overtime_ratio = performance.avg_response_time / performance.sla_target
                score -= min(50, (overtime_ratio - 1) * 100)
            else:
                # A zero or negative target makes the overshoot ratio
                # undefined (division by zero) or negative, so apply the
                # capped penalty instead of dividing.
                score -= 50

        # Throughput impact
        if performance.throughput < performance.expected_throughput * 0.8:
            score -= 20

        # Latency percentiles
        if performance.p99_latency > performance.sla_p99:
            score -= 15

        return max(0, score)
Service Health Aggregation
/**
 * Rolls per-component health up into a single service-level health record.
 *
 * Relies on members that are not defined in this snippet:
 * `componentHealthScorer`, `calculateImpact`, `getComponentWeight`,
 * `analyzeTrends`, `determineServiceState` and `assessBusinessImpact`.
 */
class ServiceHealthAggregator {
  /**
   * Build the health record for one service.
   *
   * @param {object} service - Exposes an iterable `components` and a
   *   `getComponent(id)` lookup.
   * @returns {object} Record with `components`, `overallHealth`,
   *   `criticalIssues`, `warnings`, `trends`, `state` and `businessImpact`.
   */
  aggregateServiceHealth(service) {
    const healthById = new Map();
    const summary = {
      components: [],
      overallHealth: 0,
      criticalIssues: [],
      warnings: [],
      trends: {}
    };

    // Score every component and sort its findings into the right bucket.
    for (const component of service.components) {
      const health = this.componentHealthScorer.calculate(component);
      healthById.set(component.id, health);

      const entry = {
        id: component.id,
        name: component.name,
        health: health.score,
        state: health.state,
        criticality: component.criticality
      };
      summary.components.push(entry);

      switch (health.state) {
        case 'critical': {
          const finding = {
            component: component.name,
            issue: health.primaryIssue,
            impact: this.calculateImpact(component, service)
          };
          summary.criticalIssues.push(finding);
          break;
        }
        case 'warning': {
          const finding = {
            component: component.name,
            issue: health.primaryIssue
          };
          summary.warnings.push(finding);
          break;
        }
        default:
          break;
      }
    }

    // Service-level figures are derived after all components are collected.
    summary.overallHealth = this.calculateWeightedHealth(healthById, service);
    summary.trends = this.analyzeTrends(service, summary);
    summary.state = this.determineServiceState(summary);
    summary.businessImpact = this.assessBusinessImpact(service, summary);

    return summary;
  }

  /**
   * Weighted average of component scores.
   *
   * @param {Map} componentScores - Component id -> health object with `score`.
   * @param {object} service - Used to resolve each component and its weight.
   * @returns {number} The weighted mean, or 0 when the total weight is not
   *   positive.
   */
  calculateWeightedHealth(componentScores, service) {
    let numerator = 0;
    let denominator = 0;

    for (const entry of componentScores) {
      const id = entry[0];
      const health = entry[1];
      const weight = this.getComponentWeight(service.getComponent(id), service);
      numerator += health.score * weight;
      denominator += weight;
    }

    if (!(denominator > 0)) {
      return 0;
    }
    return numerator / denominator;
  }
}
Real-time Health Monitoring
Health Event Stream
class HealthEventProcessor:
    """Consume health events and propagate their effect to service health.

    Helper coroutines and methods such as ``update_component_health``,
    ``get_affected_services``, ``recalculate_service_health``,
    ``classify_anomaly``, ``assess_severity``, ``predict_impact``,
    ``trigger_remediation`` and ``send_anomaly_alert`` are not defined in
    this snippet and must be provided elsewhere.
    """

    # Severity labels ordered from least to most severe.
    # NOTE(review): assumes assess_severity() returns one of these labels --
    # confirm against its implementation.
    SEVERITY_ORDER = ('info', 'low', 'medium', 'high', 'critical')

    # Lowest severity that triggers an alert.
    ALERT_THRESHOLD = 'high'

    def __init__(self):
        self.event_stream = EventStream()
        self.health_cache = HealthCache()
        self.anomaly_detector = AnomalyDetector()

    @classmethod
    def _meets_alert_threshold(cls, severity):
        """Return True when ``severity`` ranks at or above ``ALERT_THRESHOLD``.

        Severities are ranked by their position in ``SEVERITY_ORDER``.
        Comparing the raw strings would order them alphabetically, which
        puts 'critical' below 'high' and 'low' above it.  Labels that are
        not in ``SEVERITY_ORDER`` alert rather than being silently dropped.
        """
        label = str(severity).lower()
        if label not in cls.SEVERITY_ORDER:
            return True
        threshold_rank = cls.SEVERITY_ORDER.index(cls.ALERT_THRESHOLD)
        return cls.SEVERITY_ORDER.index(label) >= threshold_rank

    async def process_health_events(self):
        """Process real-time health events.

        For every event from ``event_stream``: refresh the component's
        health, hand anomalous events to ``handle_anomaly``, then recalculate
        each affected service, react to state changes, update the cache and
        stream the new health.
        """
        async for event in self.event_stream:
            # Update component health
            component_health = await self.update_component_health(event)

            # Check for anomalies
            if self.anomaly_detector.is_anomalous(event, component_health):
                await self.handle_anomaly(event, component_health)

            # Update service health
            affected_services = await self.get_affected_services(event.component)
            for service in affected_services:
                service_health = await self.recalculate_service_health(service)

                # Check health state changes
                if service_health.state_changed:
                    await self.handle_state_change(service, service_health)

                # Update cache
                self.health_cache.update(service.id, service_health)

                # Stream health update
                await self.stream_health_update(service, service_health)

    async def handle_anomaly(self, event, component_health):
        """Handle detected health anomalies.

        Builds an anomaly record, asks ``predict_impact`` for a prediction,
        triggers remediation when the predicted failure probability exceeds
        0.7, and sends an alert when the severity is 'high' or above.

        Args:
            event: The anomalous event; ``component`` and ``metrics`` are read.
            component_health: Current health of the event's component.
        """
        anomaly = {
            'timestamp': datetime.now(),
            'component': event.component,
            'type': self.classify_anomaly(event),
            'severity': self.assess_severity(event, component_health),
            'metrics': event.metrics
        }

        # Predictive analysis
        prediction = await self.predict_impact(anomaly)

        if prediction.failure_probability > 0.7:
            # Proactive remediation
            await self.trigger_remediation(anomaly, prediction)

        # Alert if necessary (ranked comparison, not alphabetical)
        if self._meets_alert_threshold(anomaly['severity']):
            await self.send_anomaly_alert(anomaly, prediction)
Health Dashboard
Health Dashboard Components:
Service Overview:
- Service health score
- Current state indicator
- Trend graph (24h)
- Active issues count
- SLA compliance
Component Matrix:
- Component health grid
- Dependency status
- Resource utilization
- Error rates
- Response times
Health Timeline:
- State changes
- Incidents
- Remediation actions
- Metric anomalies
Predictive Insights:
- Risk predictions
- Capacity forecasts
- Failure probability
- Recommended actions
Health States and Transitions
State Definitions
class HealthStates:
    """Catalogue of the five service health states.

    Each state is a dict holding its display name, the (low, high) score
    range it covers, a display color, a short description and the list of
    actions associated with it.

    NOTE(review): the ranges use integer bounds (e.g. 70-89 then 90-100);
    how a fractional score such as 89.5 is classified is decided by code
    that is not shown here -- confirm.
    """

    HEALTHY = dict(
        name='Healthy',
        score_range=(90, 100),
        color='green',
        description='Service operating normally',
        actions=['monitor'],
    )

    DEGRADED = dict(
        name='Degraded',
        score_range=(70, 89),
        color='yellow',
        description='Minor issues affecting service',
        actions=['investigate', 'monitor_closely'],
    )

    AT_RISK = dict(
        name='At Risk',
        score_range=(50, 69),
        color='orange',
        description='Significant issues, failure risk',
        actions=['remediate', 'prepare_failover'],
    )

    UNHEALTHY = dict(
        name='Unhealthy',
        score_range=(20, 49),
        color='red',
        description='Major issues, degraded service',
        actions=['immediate_action', 'incident'],
    )

    CRITICAL = dict(
        name='Critical',
        score_range=(0, 19),
        color='dark_red',
        description='Service failure or imminent',
        actions=['emergency_response', 'failover'],
    )
State Transition Management
/**
 * Records health state transitions and runs the actions attached to them.
 *
 * `determineTransitionReason`, `executeAction`, `notifyStateChange`,
 * `updatePredictions`, `isDegradation` and `isRecovery` are not defined in
 * this snippet and must be provided elsewhere.
 */
class HealthStateManager {
  constructor() {
    this.transitions = new Map();
    this.history = new HealthHistory();
    this.actions = new HealthActions();
  }

  /**
   * Process a service moving from one health state to another.
   *
   * Steps run strictly in sequence: log the transition, execute each
   * transition action one at a time, notify stakeholders, update predictions.
   *
   * @param {object} service - The service whose state changed (`id` is read).
   * @param {string} oldState - State before the change.
   * @param {string} newState - State after the change.
   * @returns {Promise<void>}
   */
  async handleStateTransition(service, oldState, newState) {
    const record = {
      service: service.id,
      from: oldState,
      to: newState,
      timestamp: new Date(),
      reason: this.determineTransitionReason(service)
    };

    await this.history.logTransition(record);

    const pending = this.getTransitionActions(oldState, newState);
    for (const step of pending) {
      await this.executeAction(step, service, record);
    }

    await this.notifyStateChange(service, record);
    await this.updatePredictions(service, newState);
  }

  /**
   * List the action names to run for a given transition.
   *
   * @param {string} fromState - State being left.
   * @param {string} toState - State being entered.
   * @returns {string[]} Action names in execution order; empty when the
   *   transition is neither a degradation nor a recovery.
   */
  getTransitionActions(fromState, toState) {
    const planned = [];

    if (this.isDegradation(fromState, toState)) {
      // Every degradation starts with diagnosis.
      planned.push('analyze_root_cause', 'check_dependencies');

      switch (toState) {
        case 'critical':
          planned.push('activate_incident_response', 'prepare_failover');
          break;
        case 'unhealthy':
          planned.push('scale_resources', 'enable_degraded_mode');
          break;
        default:
          break;
      }
    }

    if (this.isRecovery(fromState, toState)) {
      planned.push(
        'verify_stability',
        'restore_full_functionality',
        'document_resolution'
      );
    }

    return planned;
  }
}
Predictive Health Analytics
Health Prediction Models
class HealthPredictor:
    """Forecast service health and capacity problems.

    The model loaders, ``generate_time_points``, ``calculate_confidence``,
    ``analyze_risk``, ``generate_recommendations``, ``get_capacity_metrics``,
    ``calculate_growth_rate``, ``assess_capacity_impact`` and
    ``suggest_capacity_mitigation`` are not defined in this snippet.
    """

    def __init__(self):
        self.models = dict(
            failure_prediction=self.load_failure_model(),
            degradation_prediction=self.load_degradation_model(),
            capacity_prediction=self.load_capacity_model(),
        )
        self.feature_extractor = FeatureExtractor()

    def predict_health_trajectory(self, service, horizon='4h'):
        """Predict future health trajectory.

        Args:
            service: Service to forecast; ``id`` and ``current_health`` are
                read and the object is passed to the feature extractor.
            horizon: Forecast window, forwarded to ``generate_time_points``.

        Returns:
            dict with ``service``, ``current_health``, one ``predictions``
            entry per time point, the ``risks`` recorded for time points
            whose failure probability exceeds 0.3, and ``recommendations``.
        """
        trajectory = {
            'service': service.id,
            'current_health': service.current_health,
            'predictions': [],
            'risks': [],
            'recommendations': []
        }

        features = self.feature_extractor.extract(service)

        for offset in self.generate_time_points(horizon):
            expected_health = self.models['degradation_prediction'].predict(
                features=features, time_offset=offset
            )
            failure_chance = self.models['failure_prediction'].predict_proba(
                features=features, time_offset=offset
            )

            point = {
                'time': offset,
                'predicted_health': expected_health,
                'failure_probability': failure_chance,
                'confidence': self.calculate_confidence(features, offset)
            }
            trajectory['predictions'].append(point)

            # Only sufficiently likely failures are recorded as risks.
            if failure_chance > 0.3:
                trajectory['risks'].append(
                    self.analyze_risk(service, expected_health, failure_chance)
                )

        trajectory['recommendations'] = self.generate_recommendations(
            service, trajectory
        )
        return trajectory

    def predict_capacity_issues(self, service):
        """Predict capacity-related health issues.

        For cpu, memory, storage and network, estimates the time until the
        resource reaches its limit at the current growth rate and reports
        the resources expected to hit it in under 168 hours (one week).

        NOTE(review): treating the quotient as hours presumes the growth
        rate is expressed per hour -- confirm.

        Returns:
            list of dicts, one per at-risk resource, in the order the
            resources are checked.
        """
        usage_by_resource = self.get_capacity_metrics(service)
        rates = self.calculate_growth_rate(usage_by_resource)

        findings = []
        for resource in ('cpu', 'memory', 'storage', 'network'):
            used = usage_by_resource[resource]['current']
            ceiling = usage_by_resource[resource]['limit']

            # Flat or shrinking usage never reaches the limit.
            if not rates[resource] > 0:
                continue

            hours_left = (ceiling - used) / rates[resource]
            if hours_left < 168:  # Less than 1 week
                finding = {
                    'resource': resource,
                    'current_usage': used,
                    'limit': ceiling,
                    'time_to_limit_hours': hours_left,
                    'growth_rate': rates[resource],
                    'impact': self.assess_capacity_impact(resource, service),
                    'mitigation': self.suggest_capacity_mitigation(
                        resource, service, hours_left
                    )
                }
                findings.append(finding)

        return findings
Anomaly Detection
Anomaly Detection Patterns:
Sudden Degradation:
- Sharp drop in health score
- Multiple metrics affected
- No gradual warning
- Likely causes: deployment, failure
Gradual Degradation:
- Slow health decline
- Resource exhaustion pattern
- Increasing error rates
- Likely causes: memory leak, data growth
Cyclic Patterns:
- Recurring health dips
- Time-based correlation
- Load-related patterns
- Likely causes: batch jobs, peak traffic
Dependency Cascade:
- Upstream component degrades
- Delayed downstream impact
- Propagation pattern
- Likely causes: service dependencies
Automated Health Remediation
Self-Healing Actions
class HealthRemediation:
    """Select and run automated remediation rules for a health issue.

    ``load_remediation_rules``, ``find_applicable_rules``,
    ``escalate_to_human``, ``monitor_remediation``,
    ``log_remediation_failure``, ``check_prerequisites``,
    ``verify_improvement`` and ``rollback_actions`` are not defined in this
    snippet and must be provided elsewhere.
    """

    def __init__(self):
        self.remediation_rules = self.load_remediation_rules()
        self.action_executor = ActionExecutor()
        self.safety_checker = SafetyChecker()

    async def remediate_health_issue(self, service, health_issue):
        """Automatically remediate health issues.

        Applicable rules are tried in descending order of
        ``success_rate * confidence``; rules the safety checker rejects are
        skipped.  The first successful remediation is monitored and returned.

        Args:
            service: The affected service.
            health_issue: The issue to remediate.

        Returns:
            The remediation record (dict) of the first rule that succeeds,
            or the result of ``escalate_to_human`` when no rule applies or
            every attempted rule fails.
        """
        # Find applicable remediation rules
        applicable_rules = self.find_applicable_rules(
            service,
            health_issue
        )

        if not applicable_rules:
            return await self.escalate_to_human(service, health_issue)

        # Sort by success probability
        sorted_rules = sorted(
            applicable_rules,
            key=lambda r: r.success_rate * r.confidence,
            reverse=True
        )

        for rule in sorted_rules:
            # Safety check
            if not self.safety_checker.is_safe(rule, service):
                continue

            # Execute remediation
            result = await self.execute_remediation(rule, service, health_issue)

            # execute_remediation() returns a dict, so the outcome is read by
            # key; attribute access (result.success) raises AttributeError.
            if result['success']:
                # Monitor for stability
                await self.monitor_remediation(service, result)
                return result

            # Log failure and try next
            await self.log_remediation_failure(rule, result)

        # All remediations failed
        return await self.escalate_to_human(service, health_issue)

    async def execute_remediation(self, rule, service, issue):
        """Execute a remediation action.

        Runs each of the rule's actions after checking its prerequisites,
        then verifies that the issue improved.

        Args:
            rule: Rule to apply; ``name``, ``actions`` and
                ``rollback_on_failure`` are read.
            service: The affected service (``id`` is recorded).
            issue: The issue being remediated.

        Returns:
            dict describing the attempt: ``id``, ``service``, ``issue``,
            ``rule``, ``started_at``, ``actions`` and ``success``, plus
            ``completed_at`` on success or ``error`` and ``failed_at`` on
            failure.  Failures are reported in the record, not raised.
        """
        remediation = {
            'id': str(uuid.uuid4()),
            'service': service.id,
            'issue': issue,
            'rule': rule.name,
            'started_at': datetime.now(),
            'actions': []
        }

        try:
            for action in rule.actions:
                # Check prerequisites
                if not await self.check_prerequisites(action, service):
                    raise PrerequisiteError(f"Prerequisites not met for {action}")

                # Execute action
                action_result = await self.action_executor.execute(
                    action,
                    service,
                    issue
                )
                remediation['actions'].append(action_result)

            # Verify improvement
            if not await self.verify_improvement(service, issue):
                raise RemediationError("No improvement detected")

            remediation['success'] = True
            remediation['completed_at'] = datetime.now()

        except Exception as e:
            # Deliberately broad: any failure is turned into a failed
            # record so the caller can move on to the next rule.
            remediation['success'] = False
            remediation['error'] = str(e)
            remediation['failed_at'] = datetime.now()

            # Rollback if needed
            if rule.rollback_on_failure:
                await self.rollback_actions(remediation['actions'])

        return remediation
Remediation Library
Remediation Actions:
Resource Issues:
high_cpu:
- Scale horizontally
- Optimize queries
- Clear cache
- Restart service
high_memory:
- Increase memory allocation
- Trigger garbage collection
- Restart with higher limits
- Identify memory leaks
disk_space:
- Clean temporary files
- Archive old logs
- Expand storage
- Move to larger volume
Performance Issues:
slow_response:
- Enable caching
- Scale service
- Optimize database queries
- Add indexes
high_error_rate:
- Circuit breaker activation
- Rollback deployment
- Increase timeout
- Retry configuration
Dependency Issues:
upstream_failure:
- Activate fallback
- Use cached data
- Enable degraded mode
- Switch to backup service
Health Reporting
Health Reports
class HealthReportGenerator:
    """Assemble a health report for a service over a time period.

    ``get_service``, ``get_health_history``, ``generate_summary``,
    ``calculate_trend``, ``calculate_state_distribution``,
    ``calculate_mttr``, ``categorize_incidents``, ``analyze_root_causes``,
    ``analyze_component_health``, ``generate_predictions`` and
    ``generate_recommendations`` are not defined in this snippet.
    """

    def generate_service_health_report(self, service_id, period='7d'):
        """Generate comprehensive health report.

        Args:
            service_id: Identifier of the service to report on.
            period: Reporting window, forwarded to ``get_health_history``.

        Returns:
            dict with ``service``, ``period``, ``generated_at``,
            ``summary``, ``detailed_analysis`` (health scores, state
            distribution, incidents, component health), ``predictions`` and
            ``recommendations``.
        """
        service = self.get_service(service_id)
        health_data = self.get_health_history(service_id, period)

        report = {
            'service': service.to_dict(),
            'period': period,
            'generated_at': datetime.now(),
            'summary': self.generate_summary(health_data),
            'detailed_analysis': {},
            'recommendations': []
        }
        analysis = report['detailed_analysis']

        # Health score analysis
        analysis['health_scores'] = {
            'average': health_data.avg_health_score,
            'minimum': health_data.min_health_score,
            'maximum': health_data.max_health_score,
            'standard_deviation': health_data.health_score_std,
            'trend': self.calculate_trend(health_data.health_scores)
        }

        # State distribution
        # Parentheses keep the right-hand side on its own line; a bare line
        # break after "=" is a syntax error.
        analysis['state_distribution'] = (
            self.calculate_state_distribution(health_data)
        )

        # Incident analysis
        analysis['incidents'] = {
            'total_count': len(health_data.incidents),
            'mttr': self.calculate_mttr(health_data.incidents),
            'categories': self.categorize_incidents(health_data.incidents),
            'root_causes': self.analyze_root_causes(health_data.incidents)
        }

        # Component health
        analysis['component_health'] = (
            self.analyze_component_health(service, health_data)
        )

        # Predictions
        report['predictions'] = self.generate_predictions(service, health_data)

        # Recommendations
        report['recommendations'] = self.generate_recommendations(
            service,
            report['detailed_analysis']
        )

        return report
Health Dashboards
// Executive Health Dashboard
// Declarative widget configuration with three widgets: a portfolio grid, a
// multi-line trend chart and a risk scatter plot. The data helpers
// (getAllServices, getTopServices, getServiceRiskProfile, getRiskQuadrants)
// are called while this literal is evaluated and are defined elsewhere.
const executiveHealthDashboard = {
  widgets: [
    // Portfolio grid: services grouped by business unit, sorted by criticality.
    {
      id: 'service-portfolio-health',
      type: 'portfolio-grid',
      data: {
        services: getAllServices(),
        metrics: ['health_score', 'availability', 'incidents'],
        groupBy: 'business_unit',
        sortBy: 'criticality'
      }
    },
    // Health score trend lines over 30 days at daily granularity.
    {
      id: 'health-trends',
      type: 'multi-line-chart',
      data: {
        series: getTopServices(10),
        metric: 'health_score',
        period: '30d',
        granularity: 'daily'
      }
    },
    // Risk matrix: failure probability on the x axis, business impact on the y axis.
    {
      id: 'risk-matrix',
      type: 'scatter-plot',
      data: {
        x_axis: 'failure_probability',
        y_axis: 'business_impact',
        points: getServiceRiskProfile(),
        quadrants: getRiskQuadrants()
      }
    }
  ]
};
// Operational Health Dashboard
// Declarative widget configuration with three widgets: a live health
// monitor, a component treemap and a remediation activity feed. The data
// helpers (getCriticalServices, getServiceComponentHierarchy,
// getRecentRemediations) are called while this literal is evaluated and are
// defined elsewhere.
const operationalHealthDashboard = {
  widgets: [
    // Live monitor for critical services.
    {
      id: 'real-time-health',
      type: 'live-health-monitor',
      data: {
        services: getCriticalServices(),
        // NOTE(review): presumably milliseconds (5 s refresh) -- confirm.
        updateInterval: 5000,
        showAlerts: true,
        showMetrics: ['health', 'response_time', 'error_rate']
      }
    },
    // Treemap of components: tile size from traffic volume, color from health score.
    {
      id: 'component-health-map',
      type: 'treemap',
      data: {
        hierarchy: getServiceComponentHierarchy(),
        sizeBy: 'traffic_volume',
        colorBy: 'health_score',
        drilldown: true
      }
    },
    // Feed of recent remediations, filterable by automated vs. manual.
    {
      id: 'remediation-status',
      type: 'activity-feed',
      data: {
        activities: getRecentRemediations(),
        showOutcome: true,
        filterBy: ['automated', 'manual']
      }
    }
  ]
};
Best Practices
1. Health Monitoring Setup
- ✅ Define meaningful health metrics
- ✅ Set appropriate thresholds
- ✅ Include business context
- ✅ Regular baseline updates
2. Scoring Accuracy
- ✅ Weight components properly
- ✅ Consider dependencies
- ✅ Validate with incidents
- ✅ Continuous calibration
3. Automation
- ✅ Start with monitoring
- ✅ Gradual automation
- ✅ Safety mechanisms
- ✅ Human oversight
4. Communication
- ✅ Clear health states
- ✅ Actionable alerts
- ✅ Regular reporting
- ✅ Stakeholder visibility
Integration Examples
Monitoring Tool Integration
class MonitoringIntegration:
    """Mirror NopeSight service health into external monitoring tools.

    ``get_all_services``, ``health_monitor``, ``prometheus``, ``datadog``,
    ``grafana`` and ``map_health_to_datadog_status`` are not defined in this
    snippet and must be provided elsewhere.
    """

    def sync_health_metrics(self):
        """Sync health metrics with monitoring tools.

        For every service: push its scores through ``prometheus``, send a
        service check through ``datadog``, and create a ``grafana``
        annotation when the health state has changed.
        """
        for service in self.get_all_services():
            # Get NopeSight health data
            health = self.health_monitor.get_service_health(service.id)

            # Push to Prometheus
            metric_values = {
                'service_health_score': health.overall_score,
                'service_health_state': health.state_numeric,
                'component_health_scores': health.component_scores
            }
            metric_labels = {
                'service': service.name,
                'environment': service.environment,
                'criticality': service.criticality
            }
            self.prometheus.push_metrics(metric_values, labels=metric_labels)

            # Update Datadog
            check_status = self.map_health_to_datadog_status(health.state)
            check_tags = [
                f'service:{service.name}',
                f'health_score:{health.overall_score}'
            ]
            self.datadog.send_service_check(
                'nopesight.service.health',
                check_status,
                tags=check_tags,
                message=health.summary
            )

            # Create Grafana annotations, only for state changes
            if not health.state_changed:
                continue
            annotation = {
                'tags': ['health_change', service.name],
                'text': f'Service health changed to {health.state}',
                'time': health.timestamp
            }
            self.grafana.create_annotation(annotation)
Incident Management Integration
Incident Integration:
Health Degradation:
- Health state changes to unhealthy/critical
- Incident automatically created
- Severity based on business impact
- Health data attached to incident
Incident Updates:
- Health improvements noted
- Recovery actions logged
- Resolution verified by health state
- Post-incident health analysis
Predictive Incidents:
- High failure probability detected
- Proactive incident created
- Preventive actions initiated
- Success/failure tracked
Next Steps
- 📖 Visualization - Visualizing service maps and health
- 📖 Best Practices - Service mapping best practices
- 📖 Integration Guide - External integrations