Init
@@ -83,6 +83,25 @@ impl DiagnosticEngine {
            .await
    }

    /// Diagnoses a node with custom context (e.g., mass pod failures)
    pub async fn diagnose_node_with_context(
        &self,
        ai_client: &super::AIClient,
        _node_name: &str,
        context: &str,
    ) -> Result<String, Box<dyn std::error::Error>> {
        let full_description = format!(
            "{}. Use the get_node_details tool to inspect the node if needed.",
            context
        );

        let tools: Vec<ChatCompletionTools> =
            vec![ChatCompletionTools::Function(tools::get_node_details_tool())];

        self.run_diagnosis(ai_client, full_description, tools)
            .await
    }

    async fn run_diagnosis(
        &self,
        ai_client: &super::AIClient,

@@ -1,7 +1,7 @@
use serde::Deserialize;
use std::fs;

#[derive(Debug, Deserialize)]
#[derive(Debug, Deserialize, Clone)]
pub struct Config {
    pub api_base: String,
    pub api_key: String,
@@ -9,6 +9,8 @@ pub struct Config {
    pub system_prompt: String,
    #[serde(default = "default_max_concurrent_diagnoses")]
    pub max_concurrent_diagnoses: usize,
    pub telegram_bot_token: Option<String>,
    pub telegram_chat_id: Option<String>,
}

fn default_max_concurrent_diagnoses() -> usize {

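The new telegram_bot_token and telegram_chat_id fields are optional, so existing config files keep working without changes. A minimal sketch of how that plays out at load time follows; the path handling and the use of the toml crate are assumptions for illustration, since the actual loader in the config module is not shown in this diff:

// Illustrative sketch, not part of the commit: assumes the config is TOML on disk.
fn load_config_example(path: &str) -> Result<Config, Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string(path)?;
    // telegram_bot_token and telegram_chat_id deserialize to None when the keys
    // are absent, and max_concurrent_diagnoses falls back to its serde default.
    let cfg: Config = toml::from_str(&raw)?;
    Ok(cfg)
}
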
src/events/correlation.rs (new file, 117 lines)
@@ -0,0 +1,117 @@
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};

/// Tracks issues by node to detect patterns and enable smart correlation
#[derive(Clone)]
pub struct CorrelationEngine {
    // node_name -> (pod_name, timestamp)
    node_issues: Arc<RwLock<HashMap<String, Vec<(String, Instant)>>>>,
    // node_name -> timestamp of last diagnosis (to avoid duplicate diagnoses)
    diagnosed_nodes: Arc<RwLock<HashMap<String, Instant>>>,
}

impl CorrelationEngine {
    pub fn new() -> Self {
        Self {
            node_issues: Arc::new(RwLock::new(HashMap::new())),
            diagnosed_nodes: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Records a pod issue associated with a node
    pub async fn record_pod_issue(&self, node_name: &str, pod_name: String) {
        let mut issues = self.node_issues.write().await;
        let node_issues = issues.entry(node_name.to_string()).or_insert_with(Vec::new);

        // Clean old entries (older than 5 minutes)
        node_issues.retain(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(300));

        // Add new issue
        node_issues.push((pod_name, Instant::now()));

        debug!(node = %node_name, count = node_issues.len(), "Recorded pod issue on node");
    }

    /// Checks if there's a mass failure on a node (5 or more pods in the last 60 seconds)
    /// Also checks if we already diagnosed this node recently to avoid duplicates
    pub async fn is_mass_failure(&self, node_name: &str) -> (bool, Vec<String>) {
        // First check if we already diagnosed this node recently (within last 5 minutes)
        let diagnosed = self.diagnosed_nodes.read().await;
        if let Some(last_diagnosis) = diagnosed.get(node_name) {
            if last_diagnosis.elapsed() < Duration::from_secs(300) {
                debug!(
                    node = %node_name,
                    elapsed_secs = last_diagnosis.elapsed().as_secs(),
                    "Skipping duplicate diagnosis - node recently diagnosed"
                );
                return (false, vec![]);
            }
        }
        drop(diagnosed);

        let issues = self.node_issues.read().await;

        if let Some(node_issues) = issues.get(node_name) {
            let recent_count = node_issues
                .iter()
                .filter(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(60))
                .count();

            if recent_count >= 5 {
                let affected_pods: Vec<String> = node_issues
                    .iter()
                    .filter(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(60))
                    .map(|(name, _)| name.clone())
                    .collect();

                warn!(
                    node = %node_name,
                    affected_pods = recent_count,
                    "Detected mass failure on node"
                );

                return (true, affected_pods);
            }
        }

        (false, vec![])
    }

    /// Marks a node as diagnosed (prevents duplicate diagnoses)
    pub async fn mark_node_diagnosed(&self, node_name: &str) {
        let mut diagnosed = self.diagnosed_nodes.write().await;
        diagnosed.insert(node_name.to_string(), Instant::now());
        debug!(node = %node_name, "Marked node as diagnosed");
    }

    /// Clears recorded issues for a node (call when node becomes healthy)
    pub async fn clear_node_issues(&self, node_name: &str) {
        let mut issues = self.node_issues.write().await;
        let mut diagnosed = self.diagnosed_nodes.write().await;

        if issues.remove(node_name).is_some() {
            info!(node = %node_name, "Cleared node issues - node recovered");
        }

        // Also clear diagnosis marker
        diagnosed.remove(node_name);
    }

    /// Gets count of recent issues on a node
    pub async fn get_recent_issue_count(&self, node_name: &str) -> usize {
        let issues = self.node_issues.read().await;

        issues
            .get(node_name)
            .map(|node_issues| {
                node_issues
                    .iter()
                    .filter(|(_, timestamp)| timestamp.elapsed() < Duration::from_secs(300))
                    .count()
            })
            .unwrap_or(0)
    }
}

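A rough usage sketch of the correlation flow above (illustrative only; the node and pod names are invented, and the real caller is the EventHandler shown later in this commit):

// Sketch: record failing pods, escalate to a node-level diagnosis at 5+ failures.
async fn correlation_example() {
    let engine = CorrelationEngine::new();

    engine.record_pod_issue("node-a", "default/web-1".to_string()).await;
    engine.record_pod_issue("node-a", "default/web-2".to_string()).await;

    let (mass_failure, affected) = engine.is_mass_failure("node-a").await;
    if mass_failure {
        // Remember the node so follow-up pod events don't re-trigger a diagnosis.
        engine.mark_node_diagnosed("node-a").await;
        println!("diagnose node-a ({} pods affected)", affected.len());
    }

    // Once the node reports Ready again, forget its history.
    engine.clear_node_issues("node-a").await;
}
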
src/events/formatter.rs (new file, 86 lines)
@@ -0,0 +1,86 @@
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use std::time::{Duration, Instant};
use tracing::debug;

/// Deduplicates and formats diagnosis results to avoid spam
#[derive(Clone)]
pub struct DiagnosisFormatter {
    // Hash of diagnosis content -> (timestamp, count)
    seen_diagnoses: Arc<RwLock<HashMap<String, (Instant, usize)>>>,
}

impl DiagnosisFormatter {
    pub fn new() -> Self {
        Self {
            seen_diagnoses: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Checks if this diagnosis is a duplicate and should be suppressed
    /// Returns: (should_display, display_suffix)
    pub async fn should_display(&self, diagnosis: &str) -> (bool, Option<String>) {
        let diagnosis_hash = Self::hash_diagnosis(diagnosis);
        let mut seen = self.seen_diagnoses.write().await;

        // Clean old entries (older than 10 minutes)
        seen.retain(|_, (timestamp, _)| timestamp.elapsed() < Duration::from_secs(600));

        if let Some((last_seen, count)) = seen.get_mut(&diagnosis_hash) {
            // Similar diagnosis seen recently
            if last_seen.elapsed() < Duration::from_secs(300) {
                // Within 5 minutes - increment count and suppress
                *count += 1;
                *last_seen = Instant::now();
                debug!(
                    hash = %diagnosis_hash,
                    count = *count,
                    "Suppressing duplicate diagnosis"
                );
                return (false, Some(format!(" (seen {} times in last 5min)", count)));
            } else {
                // More than 5 minutes - reset and show
                *last_seen = Instant::now();
                *count = 1;
            }
        } else {
            // First time seeing this diagnosis
            seen.insert(diagnosis_hash.clone(), (Instant::now(), 1));
        }

        (true, None)
    }

    /// Creates a simplified hash of diagnosis to detect duplicates
    /// Focuses on root cause rather than resource names
    fn hash_diagnosis(diagnosis: &str) -> String {
        // Extract key phrases from diagnosis
        let normalized = diagnosis
            .to_lowercase()
            .lines()
            .filter(|line| {
                line.contains("root cause:")
                    || line.contains("cause:")
                    || line.contains("problem:")
                    || line.contains("severity:")
            })
            .collect::<Vec<_>>()
            .join(" ");

        // Simple hash based on content
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        normalized.hash(&mut hasher);
        format!("{:x}", hasher.finish())
    }

    /// Periodically clean old entries
    pub async fn cleanup(&self) {
        let mut seen = self.seen_diagnoses.write().await;
        seen.retain(|_, (timestamp, _)| timestamp.elapsed() < Duration::from_secs(600));
        debug!("Cleaned up diagnosis cache, {} entries remain", seen.len());
    }
}

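A small sketch of how the deduplication is meant to be used (illustrative only; the diagnosis text is invented):

// Sketch: suppress a repeated diagnosis within the 5-minute window.
async fn formatter_example() {
    let formatter = DiagnosisFormatter::new();
    let diagnosis = "Root cause: node disk pressure\nSeverity: high";

    let (show, suffix) = formatter.should_display(diagnosis).await;
    if show {
        println!("notify: {}", diagnosis);
    } else {
        // The suffix carries the repeat count, e.g. " (seen 3 times in last 5min)".
        println!("suppressed{}", suffix.unwrap_or_default());
    }
}
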
@@ -1,23 +1,41 @@
use crate::ai::{AIClient, DiagnosticEngine};
use crate::k8s::{KubeClient, NodeEvent, NodeEventType, PodEvent, PodEventType};
use crate::telegram::TelegramNotifier;
use std::sync::Arc;
use tokio::sync::Semaphore;
use tracing::{error, info, warn};

use super::correlation::CorrelationEngine;
use super::formatter::DiagnosisFormatter;

#[derive(Clone)]
pub struct EventHandler {
    kube_client: KubeClient,
    ai_client: AIClient,
    // Semaphore to limit concurrent AI diagnoses
    diagnosis_semaphore: Arc<Semaphore>,
    // Correlation engine to detect patterns
    correlation: CorrelationEngine,
    // Formatter to avoid duplicate diagnoses
    formatter: DiagnosisFormatter,
    // Optional Telegram notifier
    telegram: Option<TelegramNotifier>,
}

impl EventHandler {
    pub fn new(kube_client: KubeClient, ai_client: AIClient, max_concurrent: usize) -> Self {
    pub fn new(
        kube_client: KubeClient,
        ai_client: AIClient,
        max_concurrent: usize,
        telegram: Option<TelegramNotifier>,
    ) -> Self {
        Self {
            kube_client,
            ai_client,
            diagnosis_semaphore: Arc::new(Semaphore::new(max_concurrent)),
            correlation: CorrelationEngine::new(),
            formatter: DiagnosisFormatter::new(),
            telegram,
        }
    }

@@ -47,13 +65,73 @@ impl EventHandler {
            }
            NodeEventType::BecameReady => {
                info!(node = %event.node_name, "Node became Ready");
                // Clear correlation data for this node since it recovered
                self.correlation.clear_node_issues(&event.node_name).await;

                // Mark as resolved in Telegram
                if let Some(ref telegram) = self.telegram {
                    telegram.mark_node_resolved(&event.node_name).await;
                }
            }
        }
    }

    /// Handle pod event and trigger AI diagnostics if needed
    pub async fn handle_pod_event(&self, event: PodEvent) {
        // First, get pod details to determine which node it's on
        let node_name = match self
            .kube_client
            .get_pod_details(&event.namespace, &event.pod_name)
            .await
        {
            Ok(details) => details.node_name,
            Err(e) => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    error = %e,
                    "Failed to get pod details for correlation"
                );
                None
            }
        };

        // If pod is on a node, record the issue for correlation
        if let Some(ref node) = node_name {
            self.correlation
                .record_pod_issue(node, format!("{}/{}", event.namespace, event.pod_name))
                .await;

            // Check if this is part of a mass failure
            let (is_mass_failure, affected_pods) = self.correlation.is_mass_failure(node).await;

            if is_mass_failure {
                info!(
                    node = %node,
                    affected_pods = affected_pods.len(),
                    "Detected mass pod failure on node - diagnosing node instead"
                );

                // Diagnose the node with context about affected pods
                self.diagnose_node_with_pods(node, affected_pods).await;
                return; // Don't diagnose individual pod
            }
        }

        // Build problem description with node context
        let problem_description = match &event.event_type {
            PodEventType::Recovered => {
                info!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    "Pod recovered - marking as resolved"
                );
                // Mark as resolved in Telegram
                if let Some(ref telegram) = self.telegram {
                    telegram.mark_pod_resolved(&event.namespace, &event.pod_name).await;
                }
                return; // No diagnosis needed for recovery
            }
            PodEventType::HighRestartCount { count } => {
                warn!(
                    pod = %event.pod_name,
@@ -117,7 +195,7 @@ impl EventHandler {
            }
        };

        self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description)
        self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description, node_name.as_deref())
            .await;
    }

@@ -129,55 +207,138 @@ impl EventHandler {

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        match diagnostic_engine
        let diagnosis_opt = match diagnostic_engine
            .diagnose_nodes(&self.ai_client, vec![node_name.to_string()])
            .await
        {
            Ok(diagnosis) => {
                info!(
                    node = %node_name,
                    diagnosis = %diagnosis,
                    "AI diagnosis completed"
                    "Node diagnosis completed:\n{}", diagnosis
                );
                Some(diagnosis)
            }
            Err(e) => {
                let error_msg = e.to_string();
                error!(
                    node = %node_name,
                    error = %e,
                    error = %error_msg,
                    "AI diagnosis failed"
                );
                None
            }
        };

        // Send to Telegram if configured (after match is complete)
        if let Some(ref telegram) = self.telegram {
            if let Some(diagnosis) = diagnosis_opt {
                telegram.send_node_diagnosis(node_name, &diagnosis).await;
            }
        }
        // Permit is automatically released when _permit is dropped
    }

    async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str) {
    async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str, node_name: Option<&str>) {
        // Acquire semaphore permit to limit concurrency
        let _permit = self.diagnosis_semaphore.acquire().await.unwrap();

        info!(pod = %pod_name, namespace = %namespace, "Starting AI diagnosis (acquired permit)");
        info!(pod = %pod_name, namespace = %namespace, node = ?node_name, "Starting AI diagnosis (acquired permit)");

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        match diagnostic_engine
            .diagnose_pod(&self.ai_client, namespace, pod_name, problem)
        // Add node context to problem description if available
        let full_problem = if let Some(node) = node_name {
            format!("{} (Pod is on node: {})", problem, node)
        } else {
            problem.to_string()
        };

        let diagnosis_opt = match diagnostic_engine
            .diagnose_pod(&self.ai_client, namespace, pod_name, &full_problem)
            .await
        {
            Ok(diagnosis) => {
                info!(
                    pod = %pod_name,
                    namespace = %namespace,
                    diagnosis = %diagnosis,
                    "AI diagnosis completed"
                    "AI diagnosis completed:\n{}", diagnosis
                );
                Some(diagnosis)
            }
            Err(e) => {
                let error_msg = e.to_string();
                error!(
                    pod = %pod_name,
                    namespace = %namespace,
                    error = %e,
                    error = %error_msg,
                    "AI diagnosis failed"
                );
                None
            }
        };

        // Send to Telegram if configured (after match is complete)
        if let Some(ref telegram) = self.telegram {
            if let Some(diagnosis) = diagnosis_opt {
                telegram.send_pod_diagnosis(namespace, pod_name, &diagnosis).await;
            }
        }
        // Permit is automatically released when _permit is dropped
    }

    /// Diagnose a node with context about affected pods (mass failure scenario)
    async fn diagnose_node_with_pods(&self, node_name: &str, affected_pods: Vec<String>) {
        // Mark node as diagnosed to prevent duplicate diagnoses for subsequent pod events
        self.correlation.mark_node_diagnosed(node_name).await;

        // Acquire semaphore permit to limit concurrency
        let _permit = self.diagnosis_semaphore.acquire().await.unwrap();

        info!(
            node = %node_name,
            affected_pods = affected_pods.len(),
            "Starting grouped AI diagnosis (acquired permit)"
        );

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        // Create a detailed problem description for mass failure
        let problem_description = format!(
            "Node {} has issues affecting {} pods. Affected pods: {}. \
             This appears to be a node-level problem rather than individual pod issues. \
             Analyze the node state and determine the root cause.",
            node_name,
            affected_pods.len(),
            affected_pods.join(", ")
        );

        let diagnosis_opt = match diagnostic_engine
            .diagnose_node_with_context(&self.ai_client, node_name, &problem_description)
            .await
        {
            Ok(diagnosis) => {
                info!(
                    node = %node_name,
                    affected_pods = affected_pods.len(),
                    "Grouped diagnosis completed:\n{}", diagnosis
                );
                Some(diagnosis)
            }
            Err(e) => {
                let error_msg = e.to_string();
                error!(
                    node = %node_name,
                    error = %error_msg,
                    "Grouped AI diagnosis failed"
                );
                None
            }
        };

        // Send to Telegram with grouped context (after match is complete)
        if let Some(ref telegram) = self.telegram {
            if let Some(diagnosis) = diagnosis_opt {
                telegram.send_grouped_diagnosis(node_name, affected_pods.len(), &diagnosis).await;
            }
        }
        // Permit is automatically released when _permit is dropped

@@ -1,3 +1,7 @@
mod correlation;
mod formatter;
mod handler;

pub use correlation::CorrelationEngine;
pub use formatter::DiagnosisFormatter;
pub use handler::EventHandler;

@@ -24,6 +24,7 @@ pub enum PodEventType {
    CrashLoopBackOff,
    ImagePullError,
    ContainerCreating { duration_seconds: i64 },
    Recovered, // Pod returned to healthy state
}

pub struct PodWatcher {
@@ -83,6 +84,24 @@ impl PodWatcher {
            .and_then(|s| s.phase.as_deref())
            .unwrap_or("Unknown");

        // Check if pod is now healthy (Running with all containers ready)
        let is_healthy = phase == "Running" && Self::all_containers_ready(&pod);

        // If pod was problematic and is now healthy - emit recovery event
        if is_healthy && self.reported_issues.contains_key(&key) {
            info!(
                pod = %name,
                namespace = %namespace,
                "Pod recovered from previous issue"
            );
            self.reported_issues.remove(&key);
            return Some(PodEvent {
                pod_name: name,
                namespace,
                event_type: PodEventType::Recovered,
            });
        }

        // Helper to check if we should report this issue
        let should_report = |event_type: &PodEventType| -> bool {
            match self.reported_issues.get(&key) {
@@ -290,4 +309,12 @@ impl PodWatcher {
                .and_then(|s| s.message.clone())
        })
    }

    fn all_containers_ready(pod: &Pod) -> bool {
        pod.status
            .as_ref()
            .and_then(|s| s.container_statuses.as_ref())
            .map(|cs| cs.iter().all(|c| c.ready))
            .unwrap_or(false)
    }
}

@@ -73,6 +73,21 @@ impl KubeClient {
                    ("Unknown".to_string(), None, None)
                };

                // Extract resource requests/limits from pod spec
                let resources = pod.spec.as_ref().and_then(|spec| {
                    spec.containers.iter().find(|container| container.name == c.name).and_then(|container| {
                        container.resources.as_ref().map(|res| {
                            use super::types::ContainerResources;
                            ContainerResources {
                                requests_cpu: res.requests.as_ref().and_then(|r| r.get("cpu").map(|q| q.0.clone())),
                                requests_memory: res.requests.as_ref().and_then(|r| r.get("memory").map(|q| q.0.clone())),
                                limits_cpu: res.limits.as_ref().and_then(|l| l.get("cpu").map(|q| q.0.clone())),
                                limits_memory: res.limits.as_ref().and_then(|l| l.get("memory").map(|q| q.0.clone())),
                            }
                        })
                    })
                });

                ContainerStatus {
                    name: c.name.clone(),
                    ready: c.ready,
@@ -80,6 +95,7 @@ impl KubeClient {
                    state,
                    state_reason,
                    state_message,
                    resources,
                }
            })
            .collect()

@@ -45,6 +45,14 @@ pub struct PodCondition {
    pub message: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct ContainerResources {
    pub requests_cpu: Option<String>,
    pub requests_memory: Option<String>,
    pub limits_cpu: Option<String>,
    pub limits_memory: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct ContainerStatus {
    pub name: String,
@@ -53,6 +61,7 @@ pub struct ContainerStatus {
    pub state: String,
    pub state_reason: Option<String>,
    pub state_message: Option<String>,
    pub resources: Option<ContainerResources>,
}

#[derive(Debug, Serialize)]

src/main.rs (14 lines changed)
@@ -2,6 +2,7 @@ mod ai;
mod config;
mod events;
mod k8s;
mod telegram;
mod tools;

use ai::AIClient;
@@ -11,6 +12,7 @@ use futures::StreamExt;
use k8s::{KubeClient, NodeWatcher, PodWatcher};
use tracing::{error, info};
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use telegram::create_notifier;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -37,11 +39,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
        "AI client initialized"
    );

    // Initialize Telegram notifier (optional)
    let telegram = create_notifier(
        cfg.telegram_bot_token.clone(),
        cfg.telegram_chat_id.clone(),
    );
    if telegram.is_some() {
        info!("Telegram notifications enabled");
    } else {
        info!("Telegram notifications disabled (not configured)");
    }

    // Create event handler with concurrency limit
    let event_handler = EventHandler::new(
        kube_client.clone(),
        ai_client,
        cfg.max_concurrent_diagnoses,
        telegram,
    );

    // Start node watcher

src/telegram.rs (new file, 201 lines)
@@ -0,0 +1,201 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{debug, error, info, warn};

#[derive(Clone)]
pub struct TelegramNotifier {
    bot_token: String,
    chat_id: String,
    client: reqwest::Client,
    // Store message IDs for resources to mark them as resolved later
    // Key format: "node:name" or "pod:namespace/name"
    message_ids: Arc<RwLock<HashMap<String, i64>>>,
}

#[derive(Serialize)]
struct SendMessageRequest {
    chat_id: String,
    text: String,
    parse_mode: String,
}

#[derive(Deserialize)]
struct SendMessageResponse {
    ok: bool,
    result: Option<MessageResult>,
}

#[derive(Deserialize)]
struct MessageResult {
    message_id: i64,
}

impl TelegramNotifier {
    pub fn new(bot_token: String, chat_id: String) -> Self {
        Self {
            bot_token,
            chat_id,
            client: reqwest::Client::new(),
            message_ids: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Sends a node diagnosis notification to Telegram and stores message_id
    pub async fn send_node_diagnosis(&self, node: &str, diagnosis: &str) {
        // Format message with Markdown
        let message = format!("🔴 *Node Diagnosis*\n\n*Node:* `{}`\n\n```\n{}\n```", node, diagnosis);

        if let Ok(message_id) = self.send_message(&message).await {
            debug!(node = %node, message_id = message_id, "Telegram node notification sent");
            // Store message_id for this node
            let key = format!("node:{}", node);
            self.message_ids.write().await.insert(key, message_id);
        } else {
            error!("Failed to send Telegram notification");
        }
    }

    /// Sends a pod diagnosis notification and stores message_id
    pub async fn send_pod_diagnosis(&self, namespace: &str, pod_name: &str, diagnosis: &str) {
        let message = format!("🤖 *Pod Diagnosis*\n\n*Pod:* `{}/{}`\n\n```\n{}\n```", namespace, pod_name, diagnosis);

        if let Ok(message_id) = self.send_message(&message).await {
            debug!(pod = %pod_name, namespace = %namespace, message_id = message_id, "Telegram pod notification sent");
            // Store message_id for this pod
            let key = format!("pod:{}/{}", namespace, pod_name);
            self.message_ids.write().await.insert(key, message_id);
        } else {
            error!("Failed to send Telegram notification");
        }
    }

    /// Sends a grouped diagnosis notification for node issues
    pub async fn send_grouped_diagnosis(&self, node: &str, affected_pods: usize, diagnosis: &str) {
        let message = format!(
            "🔴 *Node Issue Detected*\n\n*Node:* `{}`\n*Affected Pods:* {}\n\n```\n{}\n```",
            node, affected_pods, diagnosis
        );

        if let Ok(message_id) = self.send_message(&message).await {
            debug!(node = %node, message_id = message_id, "Telegram grouped notification sent");
            // Store message_id for this node
            let key = format!("node:{}", node);
            self.message_ids.write().await.insert(key, message_id);
        } else {
            error!("Failed to send Telegram notification");
        }
    }

    /// Marks a node as resolved by editing the message
    pub async fn mark_node_resolved(&self, node: &str) {
        let key = format!("node:{}", node);
        let message_ids = self.message_ids.read().await;

        if let Some(&message_id) = message_ids.get(&key) {
            drop(message_ids); // Release lock before async call

            let resolved_text = format!("✅ *Node Recovered*\n\n*Node:* `{}`\n\n_Issue has been resolved_", node);
            if let Err(e) = self.edit_message(message_id, &resolved_text).await {
                warn!(node = %node, error = %e, "Failed to edit resolved message");
            } else {
                info!(node = %node, "Marked node as resolved in Telegram");
                // Remove from tracking after marking resolved
                self.message_ids.write().await.remove(&key);
            }
        } else {
            debug!(node = %node, "No message_id found for node (might not have been diagnosed via Telegram)");
        }
    }

    /// Marks a pod as resolved by editing the message
    pub async fn mark_pod_resolved(&self, namespace: &str, pod_name: &str) {
        let key = format!("pod:{}/{}", namespace, pod_name);
        let message_ids = self.message_ids.read().await;

        if let Some(&message_id) = message_ids.get(&key) {
            drop(message_ids); // Release lock before async call

            let resolved_text = format!("✅ *Pod Recovered*\n\n*Pod:* `{}/{}`\n\n_Issue has been resolved_", namespace, pod_name);
            if let Err(e) = self.edit_message(message_id, &resolved_text).await {
                warn!(pod = %pod_name, namespace = %namespace, error = %e, "Failed to edit resolved message");
            } else {
                info!(pod = %pod_name, namespace = %namespace, "Marked pod as resolved in Telegram");
                // Remove from tracking after marking resolved
                self.message_ids.write().await.remove(&key);
            }
        } else {
            debug!(pod = %pod_name, namespace = %namespace, "No message_id found for pod");
        }
    }

    async fn send_message(&self, text: &str) -> Result<i64, Box<dyn std::error::Error + Send + Sync>> {
        let url = format!("https://api.telegram.org/bot{}/sendMessage", self.bot_token);

        let request = SendMessageRequest {
            chat_id: self.chat_id.clone(),
            text: text.to_string(),
            parse_mode: "Markdown".to_string(),
        };

        let response = self.client
            .post(&url)
            .json(&request)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().await?;
            return Err(format!("Telegram API error {}: {}", status, body).into());
        }

        let response_data: SendMessageResponse = response.json().await?;

        if response_data.ok {
            if let Some(result) = response_data.result {
                Ok(result.message_id)
            } else {
                Err("No message_id in response".into())
            }
        } else {
            Err("Telegram API returned ok=false".into())
        }
    }

    async fn edit_message(&self, message_id: i64, text: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        let url = format!("https://api.telegram.org/bot{}/editMessageText", self.bot_token);

        let payload = serde_json::json!({
            "chat_id": self.chat_id,
            "message_id": message_id,
            "text": text,
            "parse_mode": "Markdown"
        });

        let response = self.client
            .post(&url)
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().await?;
            return Err(format!("Telegram API error {}: {}", status, body).into());
        }

        Ok(())
    }
}

/// Optional wrapper - returns None if Telegram not configured
pub fn create_notifier(bot_token: Option<String>, chat_id: Option<String>) -> Option<TelegramNotifier> {
    match (bot_token, chat_id) {
        (Some(token), Some(chat)) if !token.is_empty() && !chat.is_empty() => {
            Some(TelegramNotifier::new(token, chat))
        }
        _ => None,
    }
}
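For orientation, a sketch of the notifier lifecycle (illustrative only; the token and chat id are placeholders, and main.rs above shows the real wiring via create_notifier):

// Sketch: send a diagnosis, then edit the same message once the pod recovers.
async fn telegram_example() {
    if let Some(notifier) = create_notifier(
        Some("123456:ABC-placeholder-token".to_string()),
        Some("-1001234567890".to_string()),
    ) {
        notifier
            .send_pod_diagnosis("default", "web-1", "Root cause: OOMKilled; raise the memory limit")
            .await;

        // Later, on a Recovered event, the stored message_id is reused to edit in place.
        notifier.mark_pod_resolved("default", "web-1").await;
    }
}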