Init
@@ -83,6 +83,25 @@ impl DiagnosticEngine {
            .await
    }

    /// Diagnoses a node with custom context (e.g., mass pod failures)
    pub async fn diagnose_node_with_context(
        &self,
        ai_client: &super::AIClient,
        _node_name: &str,
        context: &str,
    ) -> Result<String, Box<dyn std::error::Error>> {
        let full_description = format!(
            "{}. Use the get_node_details tool to inspect the node if needed.",
            context
        );

        let tools: Vec<ChatCompletionTools> =
            vec![ChatCompletionTools::Function(tools::get_node_details_tool())];

        self.run_diagnosis(ai_client, full_description, tools)
            .await
    }

    async fn run_diagnosis(
        &self,
        ai_client: &super::AIClient,

@@ -1,7 +1,7 @@
use serde::Deserialize;
use std::fs;

#[derive(Debug, Deserialize)]
#[derive(Debug, Deserialize, Clone)]
pub struct Config {
    pub api_base: String,
    pub api_key: String,
@@ -9,6 +9,8 @@ pub struct Config {
    pub system_prompt: String,
    #[serde(default = "default_max_concurrent_diagnoses")]
    pub max_concurrent_diagnoses: usize,
    pub telegram_bot_token: Option<String>,
    pub telegram_chat_id: Option<String>,
}

fn default_max_concurrent_diagnoses() -> usize {

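The new telegram_bot_token and telegram_chat_id fields are optional, so existing config files keep working without changes. A minimal sketch of how that plays out at load time follows; the path handling and the use of the toml crate are assumptions for illustration, since the actual loader in the config module is not shown in this diff:

// Illustrative sketch, not part of the commit: assumes the config is TOML on disk.
fn load_config_example(path: &str) -> Result<Config, Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string(path)?;
    // telegram_bot_token and telegram_chat_id deserialize to None when the keys
    // are absent, and max_concurrent_diagnoses falls back to its serde default.
    let cfg: Config = toml::from_str(&raw)?;
    Ok(cfg)
}
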
src/events/correlation.rs (new file, 117 lines)
@@ -0,0 +1,117 @@
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};

/// Tracks issues by node to detect patterns and enable smart correlation
#[derive(Clone)]
pub struct CorrelationEngine {
    // node_name -> (pod_name, timestamp)
    node_issues: Arc<RwLock<HashMap<String, Vec<(String, Instant)>>>>,
    // node_name -> timestamp of last diagnosis (to avoid duplicate diagnoses)
    diagnosed_nodes: Arc<RwLock<HashMap<String, Instant>>>,
}

impl CorrelationEngine {
    pub fn new() -> Self {
        Self {
            node_issues: Arc::new(RwLock::new(HashMap::new())),
            diagnosed_nodes: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Records a pod issue associated with a node
    pub async fn record_pod_issue(&self, node_name: &str, pod_name: String) {
        let mut issues = self.node_issues.write().await;
        let node_issues = issues.entry(node_name.to_string()).or_insert_with(Vec::new);

        // Clean old entries (older than 5 minutes)
        node_issues.retain(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(300));

        // Add new issue
        node_issues.push((pod_name, Instant::now()));

        debug!(node = %node_name, count = node_issues.len(), "Recorded pod issue on node");
    }

    /// Checks if there's a mass failure on a node (5 or more pods in the last 60 seconds)
    /// Also checks if we already diagnosed this node recently to avoid duplicates
    pub async fn is_mass_failure(&self, node_name: &str) -> (bool, Vec<String>) {
        // First check if we already diagnosed this node recently (within last 5 minutes)
        let diagnosed = self.diagnosed_nodes.read().await;
        if let Some(last_diagnosis) = diagnosed.get(node_name) {
            if last_diagnosis.elapsed() < Duration::from_secs(300) {
                debug!(
                    node = %node_name,
                    elapsed_secs = last_diagnosis.elapsed().as_secs(),
                    "Skipping duplicate diagnosis - node recently diagnosed"
                );
                return (false, vec![]);
            }
        }
        drop(diagnosed);

        let issues = self.node_issues.read().await;

        if let Some(node_issues) = issues.get(node_name) {
            let recent_count = node_issues
                .iter()
                .filter(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(60))
                .count();

            if recent_count >= 5 {
                let affected_pods: Vec<String> = node_issues
                    .iter()
                    .filter(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(60))
                    .map(|(name, _)| name.clone())
                    .collect();

                warn!(
                    node = %node_name,
                    affected_pods = recent_count,
                    "Detected mass failure on node"
                );

                return (true, affected_pods);
            }
        }

        (false, vec![])
    }

    /// Marks a node as diagnosed (prevents duplicate diagnoses)
    pub async fn mark_node_diagnosed(&self, node_name: &str) {
        let mut diagnosed = self.diagnosed_nodes.write().await;
        diagnosed.insert(node_name.to_string(), Instant::now());
        debug!(node = %node_name, "Marked node as diagnosed");
    }

    /// Clears recorded issues for a node (call when node becomes healthy)
    pub async fn clear_node_issues(&self, node_name: &str) {
        let mut issues = self.node_issues.write().await;
        let mut diagnosed = self.diagnosed_nodes.write().await;

        if issues.remove(node_name).is_some() {
            info!(node = %node_name, "Cleared node issues - node recovered");
        }

        // Also clear diagnosis marker
        diagnosed.remove(node_name);
    }

    /// Gets count of recent issues on a node
    pub async fn get_recent_issue_count(&self, node_name: &str) -> usize {
        let issues = self.node_issues.read().await;

        issues
            .get(node_name)
            .map(|node_issues| {
                node_issues
                    .iter()
                    .filter(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(300))
                    .count()
            })
            .unwrap_or(0)
    }
}

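A rough usage sketch of the correlation flow above (illustrative only; the node and pod names are invented, and the real caller is the EventHandler shown later in this commit):

// Sketch: record failing pods, escalate to a node-level diagnosis at 5+ failures.
async fn correlation_example() {
    let engine = CorrelationEngine::new();

    engine.record_pod_issue("node-a", "default/web-1".to_string()).await;
    engine.record_pod_issue("node-a", "default/web-2".to_string()).await;

    let (mass_failure, affected) = engine.is_mass_failure("node-a").await;
    if mass_failure {
        // Remember the node so follow-up pod events don't re-trigger a diagnosis.
        engine.mark_node_diagnosed("node-a").await;
        println!("diagnose node-a ({} pods affected)", affected.len());
    }

    // Once the node reports Ready again, forget its history.
    engine.clear_node_issues("node-a").await;
}
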
src/events/formatter.rs (new file, 86 lines)
@@ -0,0 +1,86 @@
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use std::time::{Duration, Instant};
use tracing::debug;

/// Deduplicates and formats diagnosis results to avoid spam
#[derive(Clone)]
pub struct DiagnosisFormatter {
    // Hash of diagnosis content -> (timestamp, count)
    seen_diagnoses: Arc<RwLock<HashMap<String, (Instant, usize)>>>,
}

impl DiagnosisFormatter {
    pub fn new() -> Self {
        Self {
            seen_diagnoses: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Checks if this diagnosis is a duplicate and should be suppressed
    /// Returns: (should_display, display_suffix)
    pub async fn should_display(&self, diagnosis: &str) -> (bool, Option<String>) {
        let diagnosis_hash = Self::hash_diagnosis(diagnosis);
        let mut seen = self.seen_diagnoses.write().await;

        // Clean old entries (older than 10 minutes)
        seen.retain(|_, (timestamp, _)| timestamp.elapsed() < Duration::from_secs(600));

        if let Some((last_seen, count)) = seen.get_mut(&diagnosis_hash) {
            // Similar diagnosis seen recently
            if last_seen.elapsed() < Duration::from_secs(300) {
                // Within 5 minutes - increment count and suppress
                *count += 1;
                *last_seen = Instant::now();
                debug!(
                    hash = %diagnosis_hash,
                    count = *count,
                    "Suppressing duplicate diagnosis"
                );
                return (false, Some(format!(" (seen {} times in last 5min)", count)));
            } else {
                // More than 5 minutes - reset and show
                *last_seen = Instant::now();
                *count = 1;
            }
        } else {
            // First time seeing this diagnosis
            seen.insert(diagnosis_hash.clone(), (Instant::now(), 1));
        }

        (true, None)
    }

    /// Creates a simplified hash of diagnosis to detect duplicates
    /// Focuses on root cause rather than resource names
    fn hash_diagnosis(diagnosis: &str) -> String {
        // Extract key phrases from diagnosis
        let normalized = diagnosis
            .to_lowercase()
            .lines()
            .filter(|line| {
                line.contains("root cause:")
                    || line.contains("cause:")
                    || line.contains("problem:")
                    || line.contains("severity:")
            })
            .collect::<Vec<_>>()
            .join(" ");

        // Simple hash based on content
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        normalized.hash(&mut hasher);
        format!("{:x}", hasher.finish())
    }

    /// Periodically clean old entries
    pub async fn cleanup(&self) {
        let mut seen = self.seen_diagnoses.write().await;
        seen.retain(|_, (timestamp, _)| timestamp.elapsed() < Duration::from_secs(600));
        debug!("Cleaned up diagnosis cache, {} entries remain", seen.len());
    }
}

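A small sketch of how the deduplication is meant to be used (illustrative only; the diagnosis text is invented):

// Sketch: suppress a repeated diagnosis within the 5-minute window.
async fn formatter_example() {
    let formatter = DiagnosisFormatter::new();
    let diagnosis = "Root cause: node disk pressure\nSeverity: high";

    let (show, suffix) = formatter.should_display(diagnosis).await;
    if show {
        println!("notify: {}", diagnosis);
    } else {
        // The suffix carries the repeat count, e.g. " (seen 3 times in last 5min)".
        println!("suppressed{}", suffix.unwrap_or_default());
    }
}
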
@@ -1,23 +1,41 @@
use crate::ai::{AIClient, DiagnosticEngine};
use crate::k8s::{KubeClient, NodeEvent, NodeEventType, PodEvent, PodEventType};
use crate::telegram::TelegramNotifier;
use std::sync::Arc;
use tokio::sync::Semaphore;
use tracing::{error, info, warn};

use super::correlation::CorrelationEngine;
use super::formatter::DiagnosisFormatter;

#[derive(Clone)]
pub struct EventHandler {
    kube_client: KubeClient,
    ai_client: AIClient,
    // Semaphore to limit concurrent AI diagnoses
    diagnosis_semaphore: Arc<Semaphore>,
    // Correlation engine to detect patterns
    correlation: CorrelationEngine,
    // Formatter to avoid duplicate diagnoses
    formatter: DiagnosisFormatter,
    // Optional Telegram notifier
    telegram: Option<TelegramNotifier>,
}

impl EventHandler {
    pub fn new(kube_client: KubeClient, ai_client: AIClient, max_concurrent: usize) -> Self {
    pub fn new(
        kube_client: KubeClient,
        ai_client: AIClient,
        max_concurrent: usize,
        telegram: Option<TelegramNotifier>,
    ) -> Self {
        Self {
            kube_client,
            ai_client,
            diagnosis_semaphore: Arc::new(Semaphore::new(max_concurrent)),
            correlation: CorrelationEngine::new(),
            formatter: DiagnosisFormatter::new(),
            telegram,
        }
    }

@@ -47,13 +65,73 @@ impl EventHandler {
            }
            NodeEventType::BecameReady => {
                info!(node = %event.node_name, "Node became Ready");
                // Clear correlation data for this node since it recovered
                self.correlation.clear_node_issues(&event.node_name).await;

                // Mark as resolved in Telegram
                if let Some(ref telegram) = self.telegram {
                    telegram.mark_node_resolved(&event.node_name).await;
                }
            }
        }
    }

    /// Handle pod event and trigger AI diagnostics if needed
    pub async fn handle_pod_event(&self, event: PodEvent) {
        // First, get pod details to determine which node it's on
        let node_name = match self
            .kube_client
            .get_pod_details(&event.namespace, &event.pod_name)
            .await
        {
            Ok(details) => details.node_name,
            Err(e) => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    error = %e,
                    "Failed to get pod details for correlation"
                );
                None
            }
        };

        // If pod is on a node, record the issue for correlation
        if let Some(ref node) = node_name {
            self.correlation
                .record_pod_issue(node, format!("{}/{}", event.namespace, event.pod_name))
                .await;

            // Check if this is part of a mass failure
            let (is_mass_failure, affected_pods) = self.correlation.is_mass_failure(node).await;

            if is_mass_failure {
                info!(
                    node = %node,
                    affected_pods = affected_pods.len(),
                    "Detected mass pod failure on node - diagnosing node instead"
                );

                // Diagnose the node with context about affected pods
                self.diagnose_node_with_pods(node, affected_pods).await;
                return; // Don't diagnose individual pod
            }
        }

        // Build problem description with node context
        let problem_description = match &event.event_type {
            PodEventType::Recovered => {
                info!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    "Pod recovered - marking as resolved"
                );
                // Mark as resolved in Telegram
                if let Some(ref telegram) = self.telegram {
                    telegram.mark_pod_resolved(&event.namespace, &event.pod_name).await;
                }
                return; // No diagnosis needed for recovery
            }
            PodEventType::HighRestartCount { count } => {
                warn!(
                    pod = %event.pod_name,
@@ -117,7 +195,7 @@ impl EventHandler {
            }
        };

        self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description)
        self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description, node_name.as_deref())
            .await;
    }

@@ -129,55 +207,138 @@ impl EventHandler {

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        match diagnostic_engine
        let diagnosis_opt = match diagnostic_engine
            .diagnose_nodes(&self.ai_client, vec![node_name.to_string()])
            .await
        {
            Ok(diagnosis) => {
                info!(
                    node = %node_name,
                    diagnosis = %diagnosis,
                    "AI diagnosis completed"
                    "Node diagnosis completed:\n{}", diagnosis
                );
                Some(diagnosis)
            }
            Err(e) => {
                let error_msg = e.to_string();
                error!(
                    node = %node_name,
                    error = %e,
                    error = %error_msg,
                    "AI diagnosis failed"
                );
                None
            }
        };

        // Send to Telegram if configured (after match is complete)
        if let Some(ref telegram) = self.telegram {
            if let Some(diagnosis) = diagnosis_opt {
                telegram.send_node_diagnosis(node_name, &diagnosis).await;
            }
        }
        // Permit is automatically released when _permit is dropped
    }

    async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str) {
    async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str, node_name: Option<&str>) {
        // Acquire semaphore permit to limit concurrency
        let _permit = self.diagnosis_semaphore.acquire().await.unwrap();

        info!(pod = %pod_name, namespace = %namespace, "Starting AI diagnosis (acquired permit)");
        info!(pod = %pod_name, namespace = %namespace, node = ?node_name, "Starting AI diagnosis (acquired permit)");

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        match diagnostic_engine
            .diagnose_pod(&self.ai_client, namespace, pod_name, problem)
        // Add node context to problem description if available
        let full_problem = if let Some(node) = node_name {
            format!("{} (Pod is on node: {})", problem, node)
        } else {
            problem.to_string()
        };

        let diagnosis_opt = match diagnostic_engine
            .diagnose_pod(&self.ai_client, namespace, pod_name, &full_problem)
            .await
        {
            Ok(diagnosis) => {
                info!(
                    pod = %pod_name,
                    namespace = %namespace,
                    diagnosis = %diagnosis,
                    "AI diagnosis completed"
                    "AI diagnosis completed:\n{}", diagnosis
                );
                Some(diagnosis)
            }
            Err(e) => {
                let error_msg = e.to_string();
                error!(
                    pod = %pod_name,
                    namespace = %namespace,
                    error = %e,
                    error = %error_msg,
                    "AI diagnosis failed"
                );
                None
            }
        };

        // Send to Telegram if configured (after match is complete)
        if let Some(ref telegram) = self.telegram {
            if let Some(diagnosis) = diagnosis_opt {
                telegram.send_pod_diagnosis(namespace, pod_name, &diagnosis).await;
            }
        }
        // Permit is automatically released when _permit is dropped
    }

    /// Diagnose a node with context about affected pods (mass failure scenario)
    async fn diagnose_node_with_pods(&self, node_name: &str, affected_pods: Vec<String>) {
        // Mark node as diagnosed to prevent duplicate diagnoses for subsequent pod events
        self.correlation.mark_node_diagnosed(node_name).await;

        // Acquire semaphore permit to limit concurrency
        let _permit = self.diagnosis_semaphore.acquire().await.unwrap();

        info!(
            node = %node_name,
            affected_pods = affected_pods.len(),
            "Starting grouped AI diagnosis (acquired permit)"
        );

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        // Create a detailed problem description for mass failure
        let problem_description = format!(
            "Node {} has issues affecting {} pods. Affected pods: {}. \
             This appears to be a node-level problem rather than individual pod issues. \
             Analyze the node state and determine the root cause.",
            node_name,
            affected_pods.len(),
            affected_pods.join(", ")
        );

        let diagnosis_opt = match diagnostic_engine
            .diagnose_node_with_context(&self.ai_client, node_name, &problem_description)
            .await
        {
            Ok(diagnosis) => {
                info!(
                    node = %node_name,
                    affected_pods = affected_pods.len(),
                    "Grouped diagnosis completed:\n{}", diagnosis
                );
                Some(diagnosis)
            }
            Err(e) => {
                let error_msg = e.to_string();
                error!(
                    node = %node_name,
                    error = %error_msg,
                    "Grouped AI diagnosis failed"
                );
                None
            }
        };

        // Send to Telegram with grouped context (after match is complete)
        if let Some(ref telegram) = self.telegram {
            if let Some(diagnosis) = diagnosis_opt {
                telegram.send_grouped_diagnosis(node_name, affected_pods.len(), &diagnosis).await;
            }
        }
        // Permit is automatically released when _permit is dropped

@@ -1,3 +1,7 @@
mod correlation;
mod formatter;
mod handler;

pub use correlation::CorrelationEngine;
pub use formatter::DiagnosisFormatter;
pub use handler::EventHandler;

@@ -24,6 +24,7 @@ pub enum PodEventType {
    CrashLoopBackOff,
    ImagePullError,
    ContainerCreating { duration_seconds: i64 },
    Recovered, // Pod returned to healthy state
}

pub struct PodWatcher {
@@ -83,6 +84,24 @@ impl PodWatcher {
            .and_then(|s| s.phase.as_deref())
            .unwrap_or("Unknown");

        // Check if pod is now healthy (Running with all containers ready)
        let is_healthy = phase == "Running" && Self::all_containers_ready(&pod);

        // If pod was problematic and is now healthy - emit recovery event
        if is_healthy && self.reported_issues.contains_key(&key) {
            info!(
                pod = %name,
                namespace = %namespace,
                "Pod recovered from previous issue"
            );
            self.reported_issues.remove(&key);
            return Some(PodEvent {
                pod_name: name,
                namespace,
                event_type: PodEventType::Recovered,
            });
        }

        // Helper to check if we should report this issue
        let should_report = |event_type: &PodEventType| -> bool {
            match self.reported_issues.get(&key) {
@@ -290,4 +309,12 @@ impl PodWatcher {
                .and_then(|s| s.message.clone())
        })
    }

    fn all_containers_ready(pod: &Pod) -> bool {
        pod.status
            .as_ref()
            .and_then(|s| s.container_statuses.as_ref())
            .map(|cs| cs.iter().all(|c| c.ready))
            .unwrap_or(false)
    }
}

@@ -73,6 +73,21 @@ impl KubeClient {
                    ("Unknown".to_string(), None, None)
                };

                // Extract resource requests/limits from pod spec
                let resources = pod.spec.as_ref().and_then(|spec| {
                    spec.containers.iter().find(|container| container.name == c.name).and_then(|container| {
                        container.resources.as_ref().map(|res| {
                            use super::types::ContainerResources;
                            ContainerResources {
                                requests_cpu: res.requests.as_ref().and_then(|r| r.get("cpu").map(|q| q.0.clone())),
                                requests_memory: res.requests.as_ref().and_then(|r| r.get("memory").map(|q| q.0.clone())),
                                limits_cpu: res.limits.as_ref().and_then(|l| l.get("cpu").map(|q| q.0.clone())),
                                limits_memory: res.limits.as_ref().and_then(|l| l.get("memory").map(|q| q.0.clone())),
                            }
                        })
                    })
                });

                ContainerStatus {
                    name: c.name.clone(),
                    ready: c.ready,
@@ -80,6 +95,7 @@ impl KubeClient {
                    state,
                    state_reason,
                    state_message,
                    resources,
                }
            })
            .collect()

@@ -45,6 +45,14 @@ pub struct PodCondition {
    pub message: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct ContainerResources {
    pub requests_cpu: Option<String>,
    pub requests_memory: Option<String>,
    pub limits_cpu: Option<String>,
    pub limits_memory: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct ContainerStatus {
    pub name: String,
@@ -53,6 +61,7 @@ pub struct ContainerStatus {
    pub state: String,
    pub state_reason: Option<String>,
    pub state_message: Option<String>,
    pub resources: Option<ContainerResources>,
}

#[derive(Debug, Serialize)]

src/main.rs (14 lines changed)
@@ -2,6 +2,7 @@ mod ai;
mod config;
mod events;
mod k8s;
mod telegram;
mod tools;

use ai::AIClient;
@@ -11,6 +12,7 @@ use futures::StreamExt;
use k8s::{KubeClient, NodeWatcher, PodWatcher};
use tracing::{error, info};
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use telegram::create_notifier;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -37,11 +39,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
        "AI client initialized"
    );

    // Initialize Telegram notifier (optional)
    let telegram = create_notifier(
        cfg.telegram_bot_token.clone(),
        cfg.telegram_chat_id.clone(),
    );
    if telegram.is_some() {
        info!("Telegram notifications enabled");
    } else {
        info!("Telegram notifications disabled (not configured)");
    }

    // Create event handler with concurrency limit
    let event_handler = EventHandler::new(
        kube_client.clone(),
        ai_client,
        cfg.max_concurrent_diagnoses,
        telegram,
    );

    // Start node watcher

src/telegram.rs (new file, 201 lines)
@@ -0,0 +1,201 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{debug, error, info, warn};

#[derive(Clone)]
pub struct TelegramNotifier {
    bot_token: String,
    chat_id: String,
    client: reqwest::Client,
    // Store message IDs for resources to mark them as resolved later
    // Key format: "node:name" or "pod:namespace/name"
    message_ids: Arc<RwLock<HashMap<String, i64>>>,
}

#[derive(Serialize)]
struct SendMessageRequest {
    chat_id: String,
    text: String,
    parse_mode: String,
}

#[derive(Deserialize)]
struct SendMessageResponse {
    ok: bool,
    result: Option<MessageResult>,
}

#[derive(Deserialize)]
struct MessageResult {
    message_id: i64,
}

impl TelegramNotifier {
    pub fn new(bot_token: String, chat_id: String) -> Self {
        Self {
            bot_token,
            chat_id,
            client: reqwest::Client::new(),
            message_ids: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Sends a node diagnosis notification to Telegram and stores message_id
    pub async fn send_node_diagnosis(&self, node: &str, diagnosis: &str) {
        // Format message with Markdown
        let message = format!("🔴 *Node Diagnosis*\n\n*Node:* `{}`\n\n```\n{}\n```", node, diagnosis);

        if let Ok(message_id) = self.send_message(&message).await {
            debug!(node = %node, message_id = message_id, "Telegram node notification sent");
            // Store message_id for this node
            let key = format!("node:{}", node);
            self.message_ids.write().await.insert(key, message_id);
        } else {
            error!("Failed to send Telegram notification");
        }
    }

    /// Sends a pod diagnosis notification and stores message_id
    pub async fn send_pod_diagnosis(&self, namespace: &str, pod_name: &str, diagnosis: &str) {
        let message = format!("🤖 *Pod Diagnosis*\n\n*Pod:* `{}/{}`\n\n```\n{}\n```", namespace, pod_name, diagnosis);

        if let Ok(message_id) = self.send_message(&message).await {
            debug!(pod = %pod_name, namespace = %namespace, message_id = message_id, "Telegram pod notification sent");
            // Store message_id for this pod
            let key = format!("pod:{}/{}", namespace, pod_name);
            self.message_ids.write().await.insert(key, message_id);
        } else {
            error!("Failed to send Telegram notification");
        }
    }

    /// Sends a grouped diagnosis notification for node issues
    pub async fn send_grouped_diagnosis(&self, node: &str, affected_pods: usize, diagnosis: &str) {
        let message = format!(
            "🔴 *Node Issue Detected*\n\n*Node:* `{}`\n*Affected Pods:* {}\n\n```\n{}\n```",
            node, affected_pods, diagnosis
        );

        if let Ok(message_id) = self.send_message(&message).await {
            debug!(node = %node, message_id = message_id, "Telegram grouped notification sent");
            // Store message_id for this node
            let key = format!("node:{}", node);
            self.message_ids.write().await.insert(key, message_id);
        } else {
            error!("Failed to send Telegram notification");
        }
    }

    /// Marks a node as resolved by editing the message
    pub async fn mark_node_resolved(&self, node: &str) {
        let key = format!("node:{}", node);
        let message_ids = self.message_ids.read().await;

        if let Some(&message_id) = message_ids.get(&key) {
            drop(message_ids); // Release lock before async call

            let resolved_text = format!("✅ *Node Recovered*\n\n*Node:* `{}`\n\n_Issue has been resolved_", node);
            if let Err(e) = self.edit_message(message_id, &resolved_text).await {
                warn!(node = %node, error = %e, "Failed to edit resolved message");
            } else {
                info!(node = %node, "Marked node as resolved in Telegram");
                // Remove from tracking after marking resolved
                self.message_ids.write().await.remove(&key);
            }
        } else {
            debug!(node = %node, "No message_id found for node (might not have been diagnosed via Telegram)");
        }
    }

    /// Marks a pod as resolved by editing the message
    pub async fn mark_pod_resolved(&self, namespace: &str, pod_name: &str) {
        let key = format!("pod:{}/{}", namespace, pod_name);
        let message_ids = self.message_ids.read().await;

        if let Some(&message_id) = message_ids.get(&key) {
            drop(message_ids); // Release lock before async call

            let resolved_text = format!("✅ *Pod Recovered*\n\n*Pod:* `{}/{}`\n\n_Issue has been resolved_", namespace, pod_name);
            if let Err(e) = self.edit_message(message_id, &resolved_text).await {
                warn!(pod = %pod_name, namespace = %namespace, error = %e, "Failed to edit resolved message");
            } else {
                info!(pod = %pod_name, namespace = %namespace, "Marked pod as resolved in Telegram");
                // Remove from tracking after marking resolved
                self.message_ids.write().await.remove(&key);
            }
        } else {
            debug!(pod = %pod_name, namespace = %namespace, "No message_id found for pod");
        }
    }

    async fn send_message(&self, text: &str) -> Result<i64, Box<dyn std::error::Error + Send + Sync>> {
        let url = format!("https://api.telegram.org/bot{}/sendMessage", self.bot_token);

        let request = SendMessageRequest {
            chat_id: self.chat_id.clone(),
            text: text.to_string(),
            parse_mode: "Markdown".to_string(),
        };

        let response = self.client
            .post(&url)
            .json(&request)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().await?;
            return Err(format!("Telegram API error {}: {}", status, body).into());
        }

        let response_data: SendMessageResponse = response.json().await?;

        if response_data.ok {
            if let Some(result) = response_data.result {
                Ok(result.message_id)
            } else {
                Err("No message_id in response".into())
            }
        } else {
            Err("Telegram API returned ok=false".into())
        }
    }

    async fn edit_message(&self, message_id: i64, text: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        let url = format!("https://api.telegram.org/bot{}/editMessageText", self.bot_token);

        let payload = serde_json::json!({
            "chat_id": self.chat_id,
            "message_id": message_id,
            "text": text,
            "parse_mode": "Markdown"
        });

        let response = self.client
            .post(&url)
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().await?;
            return Err(format!("Telegram API error {}: {}", status, body).into());
        }

        Ok(())
    }
}

/// Optional wrapper - returns None if Telegram not configured
pub fn create_notifier(bot_token: Option<String>, chat_id: Option<String>) -> Option<TelegramNotifier> {
    match (bot_token, chat_id) {
        (Some(token), Some(chat)) if !token.is_empty() && !chat.is_empty() => {
            Some(TelegramNotifier::new(token, chat))
        }
        _ => None,
    }
}
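For orientation, a sketch of the notifier lifecycle (illustrative only; the token and chat id are placeholders, and main.rs above shows the real wiring via create_notifier):

// Sketch: send a diagnosis, then edit the same message once the pod recovers.
async fn telegram_example() {
    if let Some(notifier) = create_notifier(
        Some("123456:ABC-placeholder-token".to_string()),
        Some("-1001234567890".to_string()),
    ) {
        notifier
            .send_pod_diagnosis("default", "web-1", "Root cause: OOMKilled; raise the memory limit")
            .await;

        // Later, on a Recovered event, the stored message_id is reused to edit in place.
        notifier.mark_pod_resolved("default", "web-1").await;
    }
}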