Init
.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
/target
.crush
Cargo.lock (generated, new file, 2883 lines)
File diff suppressed because it is too large.
Cargo.toml (new file, 17 lines)
@@ -0,0 +1,17 @@
[package]
name = "duty-ai-ops"
version = "0.1.0"
edition = "2021"

[dependencies]
async-openai = { version = "0.32", features = ["chat-completion"] }
tokio = { version = "1", features = ["full", "signal"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"
kube = { version = "0.97", features = ["runtime", "derive"] }
k8s-openapi = { version = "0.23", features = ["v1_31"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
futures = "0.3"
chrono = "0.4"
config.toml (new file, 10 lines)
@@ -0,0 +1,10 @@
# API Configuration
api_base = "http://localhost:11434/v1"
api_key = "ollama"
model = "qwen3-tools:latest"

# AI concurrency settings
max_concurrent_diagnoses = 1 # Maximum parallel AI diagnosis requests

# System prompt for the AI assistant
system_prompt = "You are a Kubernetes diagnostic system. Analyze issues using provided tools and output ONLY a brief technical diagnosis in plain text. Rules: 1) Call tools to gather data. 2) After getting data, output diagnosis as plain text - NO XML, NO function calls in output. 3) Format: 'Resource [name] - [problem]. Cause: [technical reason].' 4) Maximum 3 sentences. 5) If tool fails with error, diagnose based on error message. 6) Output must be plain text diagnosis, not tool calls."
src/ai/client.rs (new file, 57 lines)
@@ -0,0 +1,57 @@
use crate::config::Config;
use async_openai::{
    config::OpenAIConfig,
    types::chat::{
        ChatCompletionRequestMessage, ChatCompletionRequestSystemMessageArgs,
        ChatCompletionRequestUserMessageArgs,
    },
    Client,
};

#[derive(Clone)]
pub struct AIClient {
    client: Client<OpenAIConfig>,
    model: String,
    system_prompt: String,
}

impl AIClient {
    pub fn new(cfg: &Config) -> Self {
        let config = OpenAIConfig::new()
            .with_api_base(&cfg.api_base)
            .with_api_key(&cfg.api_key);

        let client = Client::with_config(config);

        Self {
            client,
            model: cfg.model.clone(),
            system_prompt: cfg.system_prompt.clone(),
        }
    }

    pub fn client(&self) -> &Client<OpenAIConfig> {
        &self.client
    }

    pub fn model(&self) -> &str {
        &self.model
    }

    pub fn system_message(&self) -> Result<ChatCompletionRequestMessage, Box<dyn std::error::Error>> {
        Ok(ChatCompletionRequestSystemMessageArgs::default()
            .content(self.system_prompt.clone())
            .build()?
            .into())
    }

    pub fn user_message(
        &self,
        content: String,
    ) -> Result<ChatCompletionRequestMessage, Box<dyn std::error::Error>> {
        Ok(ChatCompletionRequestUserMessageArgs::default()
            .content(content)
            .build()?
            .into())
    }
}
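
A minimal smoke-test sketch of the client above (assumed, not part of this commit): it composes system_message and user_message into one chat completion the same way DiagnosticEngine does in src/ai/diagnostics.rs.

// Sketch only: assumes a config.toml like the one shown earlier.
use crate::ai::AIClient;
use crate::config::Config;
use async_openai::types::chat::CreateChatCompletionRequestArgs;

async fn smoke_test() -> Result<(), Box<dyn std::error::Error>> {
    let cfg = Config::load("config.toml")?;
    let ai = AIClient::new(&cfg);
    let request = CreateChatCompletionRequestArgs::default()
        .model(ai.model())
        .messages(vec![
            ai.system_message()?,
            ai.user_message("ping".to_string())?,
        ])
        .build()?;
    let response = ai.client().chat().create(request).await?;
    // Print the first choice's text content, if any
    println!("{:?}", response.choices.first().and_then(|c| c.message.content.clone()));
    Ok(())
}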
src/ai/diagnostics.rs (new file, 229 lines)
@@ -0,0 +1,229 @@
use crate::k8s::KubeClient;
use crate::tools;
use async_openai::types::chat::{
    ChatCompletionMessageToolCalls, ChatCompletionRequestAssistantMessageArgs,
    ChatCompletionRequestMessage, ChatCompletionRequestToolMessageArgs,
    ChatCompletionTools, CreateChatCompletionRequestArgs,
};
use serde::Deserialize;
use std::time::Duration;
use tracing::{debug, warn};

pub struct DiagnosticEngine {
    kube_client: KubeClient,
}

#[derive(Debug, Deserialize)]
struct NodeToolArgs {
    node_name: String,
}

#[derive(Debug, Deserialize)]
struct PodDetailsArgs {
    namespace: String,
    pod_name: String,
}

#[derive(Debug, Deserialize)]
struct PodLogsArgs {
    namespace: String,
    pod_name: String,
    container_name: Option<String>,
    tail_lines: Option<i64>,
}

impl DiagnosticEngine {
    pub fn new(kube_client: KubeClient) -> Self {
        Self { kube_client }
    }

    /// Diagnoses NotReady nodes using AI with tool calling capability
    pub async fn diagnose_nodes(
        &self,
        ai_client: &super::AIClient,
        not_ready_nodes: Vec<String>,
    ) -> Result<String, Box<dyn std::error::Error>> {
        let problem_description = format!(
            "The following Kubernetes nodes are in NotReady state: {}. \
             Analyze the issue and determine the root cause. \
             Use the get_node_details tool to inspect nodes if needed.",
            not_ready_nodes.join(", ")
        );

        let tools: Vec<ChatCompletionTools> =
            vec![ChatCompletionTools::Function(tools::get_node_details_tool())];

        self.run_diagnosis(ai_client, problem_description, tools)
            .await
    }

    /// Diagnoses problematic pods using AI with tool calling capability
    pub async fn diagnose_pod(
        &self,
        ai_client: &super::AIClient,
        namespace: &str,
        pod_name: &str,
        problem_description: &str,
    ) -> Result<String, Box<dyn std::error::Error>> {
        let full_description = format!(
            "Pod {}/{} has the following issue: {}. \
             Analyze the problem and determine the root cause. \
             Use get_pod_details to inspect the pod state and get_pod_logs to examine recent logs.",
            namespace, pod_name, problem_description
        );

        let tools: Vec<ChatCompletionTools> = vec![
            ChatCompletionTools::Function(tools::get_pod_details_tool()),
            ChatCompletionTools::Function(tools::get_pod_logs_tool()),
        ];

        self.run_diagnosis(ai_client, full_description, tools)
            .await
    }

    async fn run_diagnosis(
        &self,
        ai_client: &super::AIClient,
        problem_description: String,
        tools: Vec<ChatCompletionTools>,
    ) -> Result<String, Box<dyn std::error::Error>> {
        // Add timeout for entire diagnosis
        let diagnosis = tokio::time::timeout(
            Duration::from_secs(60),
            self.run_diagnosis_inner(ai_client, problem_description, tools),
        )
        .await;

        match diagnosis {
            Ok(result) => result,
            Err(_) => {
                warn!("AI diagnosis timed out after 60 seconds");
                Err("Diagnosis timeout".into())
            }
        }
    }

    async fn run_diagnosis_inner(
        &self,
        ai_client: &super::AIClient,
        problem_description: String,
        tools: Vec<ChatCompletionTools>,
    ) -> Result<String, Box<dyn std::error::Error>> {
        let mut messages: Vec<ChatCompletionRequestMessage> = vec![
            ai_client.system_message()?,
            ai_client.user_message(problem_description)?,
        ];

        // Conversation loop
        for iteration in 0..10 {
            debug!(iteration = iteration + 1, "AI diagnosis iteration");

            let request = CreateChatCompletionRequestArgs::default()
                .model(ai_client.model())
                .messages(messages.clone())
                .tools(tools.clone())
                .build()?;

            debug!("Sending request to AI");
            let response = ai_client.client().chat().create(request).await?;
            debug!("Received response from AI");

            let choice = response.choices.first().ok_or("No response from AI")?;
            let assistant_message = choice.message.clone();

            if let Some(tool_calls) = &assistant_message.tool_calls {
                debug!(tool_count = tool_calls.len(), "AI calling tools");

                messages.push(
                    ChatCompletionRequestAssistantMessageArgs::default()
                        .tool_calls(tool_calls.clone())
                        .build()?
                        .into(),
                );

                for tool_call_enum in tool_calls {
                    if let ChatCompletionMessageToolCalls::Function(tool_call) = tool_call_enum {
                        let tool_name = &tool_call.function.name;
                        let tool_args = &tool_call.function.arguments;

                        debug!(tool = %tool_name, args = %tool_args, "Executing tool");

                        // Execute tool with error handling
                        let result = match self.execute_tool(tool_name, tool_args).await {
                            Ok(data) => {
                                debug!(tool = %tool_name, "Tool executed successfully");
                                data
                            }
                            Err(e) => {
                                warn!(tool = %tool_name, error = %e, "Tool execution failed");
                                // Return error as tool result instead of failing
                                format!(
                                    "{{\"error\": \"Tool execution failed: {}\", \"available\": false}}",
                                    e.to_string().replace('"', "'")
                                )
                            }
                        };

                        messages.push(
                            ChatCompletionRequestToolMessageArgs::default()
                                .content(result)
                                .tool_call_id(tool_call.id.clone())
                                .build()?
                                .into(),
                        );
                    }
                }
            } else if let Some(content) = &assistant_message.content {
                debug!("AI provided final diagnosis");
                return Ok(content.clone());
            } else {
                warn!("AI returned no content and no tool calls");
                return Err("AI returned no content and no tool calls".into());
            }
        }

        warn!("Maximum iterations reached without final answer");
        Err("Maximum iterations reached without final answer".into())
    }

    async fn execute_tool(
        &self,
        tool_name: &str,
        arguments_json: &str,
    ) -> Result<String, Box<dyn std::error::Error>> {
        match tool_name {
            "get_node_details" => {
                let args: NodeToolArgs = serde_json::from_str(arguments_json)?;
                let details = self.kube_client.get_node_details(&args.node_name).await?;
                Ok(serde_json::to_string_pretty(&details)?)
            }
            "get_pod_details" => {
                let args: PodDetailsArgs = serde_json::from_str(arguments_json)?;
                let details = self
                    .kube_client
                    .get_pod_details(&args.namespace, &args.pod_name)
                    .await?;
                Ok(serde_json::to_string_pretty(&details)?)
            }
            "get_pod_logs" => {
                let args: PodLogsArgs = serde_json::from_str(arguments_json)?;
                let tail_lines = args.tail_lines.or(Some(50));
                let logs = self
                    .kube_client
                    .get_pod_logs(
                        &args.namespace,
                        &args.pod_name,
                        args.container_name.as_deref(),
                        tail_lines,
                    )
                    .await?;
                Ok(serde_json::to_string_pretty(&logs)?)
            }
            _ => Err(format!("Unknown tool: {}", tool_name).into()),
        }
    }
}
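
A usage sketch (assumed, not in this commit) of driving the engine directly, for example from a one-off CLI check; the pod name api-0 is hypothetical.

// Sketch only: wires KubeClient, AIClient, and DiagnosticEngine together.
use crate::ai::{AIClient, DiagnosticEngine};
use crate::config::Config;
use crate::k8s::KubeClient;

async fn diagnose_one(cfg: &Config) -> Result<(), Box<dyn std::error::Error>> {
    let kube = KubeClient::new().await?;
    let engine = DiagnosticEngine::new(kube);
    let ai = AIClient::new(cfg);
    let verdict = engine
        .diagnose_pod(&ai, "default", "api-0", "Container is in CrashLoopBackOff state")
        .await?;
    println!("{verdict}");
    Ok(())
}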
src/ai/mod.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
mod client;
mod diagnostics;

pub use client::AIClient;
pub use diagnostics::DiagnosticEngine;
src/config.rs (new file, 24 lines)
@@ -0,0 +1,24 @@
use serde::Deserialize;
use std::fs;

#[derive(Debug, Deserialize)]
pub struct Config {
    pub api_base: String,
    pub api_key: String,
    pub model: String,
    pub system_prompt: String,
    #[serde(default = "default_max_concurrent_diagnoses")]
    pub max_concurrent_diagnoses: usize,
}

fn default_max_concurrent_diagnoses() -> usize {
    1
}

impl Config {
    pub fn load(path: &str) -> Result<Self, Box<dyn std::error::Error>> {
        let content = fs::read_to_string(path)?;
        let config: Config = toml::from_str(&content)?;
        Ok(config)
    }
}
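
A minimal round-trip sketch (not part of the commit) confirming that a config.toml like the sample above deserializes into this struct, and that max_concurrent_diagnoses falls back to its serde default when omitted.

#[cfg(test)]
mod config_tests {
    use super::Config;

    #[test]
    fn parses_sample_config() {
        // Mirrors config.toml, minus the optional concurrency setting
        let raw = r#"
            api_base = "http://localhost:11434/v1"
            api_key = "ollama"
            model = "qwen3-tools:latest"
            system_prompt = "You are a Kubernetes diagnostic system."
        "#;
        let cfg: Config = toml::from_str(raw).unwrap();
        assert_eq!(cfg.max_concurrent_diagnoses, 1); // default applied
    }
}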
src/events/handler.rs (new file, 185 lines)
@@ -0,0 +1,185 @@
use crate::ai::{AIClient, DiagnosticEngine};
use crate::k8s::{KubeClient, NodeEvent, NodeEventType, PodEvent, PodEventType};
use std::sync::Arc;
use tokio::sync::Semaphore;
use tracing::{error, info, warn};

#[derive(Clone)]
pub struct EventHandler {
    kube_client: KubeClient,
    ai_client: AIClient,
    // Semaphore to limit concurrent AI diagnoses
    diagnosis_semaphore: Arc<Semaphore>,
}

impl EventHandler {
    pub fn new(kube_client: KubeClient, ai_client: AIClient, max_concurrent: usize) -> Self {
        Self {
            kube_client,
            ai_client,
            diagnosis_semaphore: Arc::new(Semaphore::new(max_concurrent)),
        }
    }

    /// Handle node event and trigger AI diagnostics if needed
    pub async fn handle_node_event(&self, event: NodeEvent) {
        match event.event_type {
            NodeEventType::BecameNotReady => {
                warn!(
                    node = %event.node_name,
                    "Node became NotReady, starting AI diagnostics"
                );
                self.diagnose_node(&event.node_name).await;
            }
            NodeEventType::ConditionChanged {
                ref condition_type,
                ref status,
                ref reason,
            } => {
                warn!(
                    node = %event.node_name,
                    condition = %condition_type,
                    status = %status,
                    reason = ?reason,
                    "Problematic condition detected, starting AI diagnostics"
                );
                self.diagnose_node(&event.node_name).await;
            }
            NodeEventType::BecameReady => {
                info!(node = %event.node_name, "Node became Ready");
            }
        }
    }

    /// Handle pod event and trigger AI diagnostics if needed
    pub async fn handle_pod_event(&self, event: PodEvent) {
        let problem_description = match &event.event_type {
            PodEventType::HighRestartCount { count } => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    restart_count = count,
                    "Pod has high restart count"
                );
                format!("High restart count: {} restarts", count)
            }
            PodEventType::CrashLoopBackOff => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    "Pod in CrashLoopBackOff"
                );
                "Container is in CrashLoopBackOff state".to_string()
            }
            PodEventType::ImagePullError => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    "Pod has image pull error"
                );
                "Failed to pull container image".to_string()
            }
            PodEventType::Pending { reason } => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    reason = ?reason,
                    "Pod stuck in Pending"
                );
                format!(
                    "Pod stuck in Pending state. Reason: {}",
                    reason.as_deref().unwrap_or("Unknown")
                )
            }
            PodEventType::Failed { reason } => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    reason = ?reason,
                    "Pod in Failed state"
                );
                format!(
                    "Pod in Failed state. Reason: {}",
                    reason.as_deref().unwrap_or("Unknown")
                )
            }
            PodEventType::ContainerCreating { duration_seconds } => {
                warn!(
                    pod = %event.pod_name,
                    namespace = %event.namespace,
                    duration_seconds = duration_seconds,
                    "Container creating for too long"
                );
                format!(
                    "Container has been creating for {} seconds",
                    duration_seconds
                )
            }
        };

        self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description)
            .await;
    }

    async fn diagnose_node(&self, node_name: &str) {
        // Acquire semaphore permit to limit concurrency
        let _permit = self.diagnosis_semaphore.acquire().await.unwrap();

        info!(node = %node_name, "Starting AI diagnosis (acquired permit)");

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        match diagnostic_engine
            .diagnose_nodes(&self.ai_client, vec![node_name.to_string()])
            .await
        {
            Ok(diagnosis) => {
                info!(
                    node = %node_name,
                    diagnosis = %diagnosis,
                    "AI diagnosis completed"
                );
            }
            Err(e) => {
                error!(
                    node = %node_name,
                    error = %e,
                    "AI diagnosis failed"
                );
            }
        }
        // Permit is automatically released when _permit is dropped
    }

    async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str) {
        // Acquire semaphore permit to limit concurrency
        let _permit = self.diagnosis_semaphore.acquire().await.unwrap();

        info!(pod = %pod_name, namespace = %namespace, "Starting AI diagnosis (acquired permit)");

        let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());

        match diagnostic_engine
            .diagnose_pod(&self.ai_client, namespace, pod_name, problem)
            .await
        {
            Ok(diagnosis) => {
                info!(
                    pod = %pod_name,
                    namespace = %namespace,
                    diagnosis = %diagnosis,
                    "AI diagnosis completed"
                );
            }
            Err(e) => {
                error!(
                    pod = %pod_name,
                    namespace = %namespace,
                    error = %e,
                    "AI diagnosis failed"
                );
            }
        }
        // Permit is automatically released when _permit is dropped
    }
}
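
Because main.rs spawns a task per incoming event, it is this semaphore, not the spawn sites, that bounds concurrent AI calls. A standalone sketch (not in the commit) of the gating behavior with a single permit:

use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    let sem = Arc::new(Semaphore::new(1)); // like max_concurrent_diagnoses = 1
    let mut tasks = Vec::new();
    for i in 0..2 {
        let sem = sem.clone();
        tasks.push(tokio::spawn(async move {
            let _permit = sem.acquire().await.unwrap();
            // With one permit, these bodies run strictly one at a time
            println!("diagnosis {i} running");
        }));
    }
    for t in tasks {
        t.await.unwrap();
    }
}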
src/events/mod.rs (new file, 3 lines)
@@ -0,0 +1,3 @@
mod handler;

pub use handler::EventHandler;
src/k8s/client.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
use kube::{Client, Config};

#[derive(Clone)]
pub struct KubeClient {
    client: Client,
}

impl KubeClient {
    /// Creates a client using standard credentials
    /// (kubeconfig or service account from environment)
    pub async fn new() -> Result<Self, Box<dyn std::error::Error>> {
        let config = Config::infer().await?;
        let client = Client::try_from(config)?;
        Ok(Self { client })
    }

    pub fn inner(&self) -> &Client {
        &self.client
    }
}
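
A short sketch (assumed, not in the commit) of using KubeClient outside the watchers: any kube Api handle can be built from the shared inner client.

use k8s_openapi::api::core::v1::Pod;
use kube::{api::ListParams, Api};

async fn list_default_pods(kc: &crate::k8s::KubeClient) -> Result<(), kube::Error> {
    let pods: Api<Pod> = Api::namespaced(kc.inner().clone(), "default");
    for p in pods.list(&ListParams::default()).await? {
        println!("{}", p.metadata.name.unwrap_or_default());
    }
    Ok(())
}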
src/k8s/mod.rs (new file, 10 lines)
@@ -0,0 +1,10 @@
mod client;
mod nodes;
mod pods;
mod pod_watcher;
mod types;
mod watcher;

pub use client::KubeClient;
pub use pod_watcher::{PodEvent, PodEventType, PodWatcher};
pub use watcher::{NodeEvent, NodeEventType, NodeWatcher};
src/k8s/nodes.rs (new file, 181 lines)
@@ -0,0 +1,181 @@
use super::client::KubeClient;
use super::types::{NodeCondition, NodeDetails, NodeResources, NodeStatus};
use k8s_openapi::api::core::v1::{Node, Pod};
use kube::{api::ListParams, Api};

impl KubeClient {
    /// Gets list of all nodes and their statuses
    pub async fn get_nodes(&self) -> Result<Vec<NodeStatus>, Box<dyn std::error::Error>> {
        let nodes: Api<Node> = Api::all(self.inner().clone());
        let node_list = nodes.list(&ListParams::default()).await?;

        let mut result = Vec::new();

        for node in node_list.items {
            let name = node
                .metadata
                .name
                .clone()
                .unwrap_or_else(|| "unknown".to_string());

            let ready = node
                .status
                .as_ref()
                .and_then(|s| s.conditions.as_ref())
                .and_then(|conditions| {
                    conditions
                        .iter()
                        .find(|c| c.type_ == "Ready")
                        .map(|c| c.status == "True")
                })
                .unwrap_or(false);

            let version = node
                .status
                .as_ref()
                .and_then(|s| s.node_info.as_ref())
                .map(|info| info.kubelet_version.clone())
                .unwrap_or_else(|| "unknown".to_string());

            let internal_ip = node
                .status
                .as_ref()
                .and_then(|s| s.addresses.as_ref())
                .and_then(|addresses| {
                    addresses
                        .iter()
                        .find(|a| a.type_ == "InternalIP")
                        .map(|a| a.address.clone())
                });

            result.push(NodeStatus {
                name,
                ready,
                version,
                internal_ip,
            });
        }

        Ok(result)
    }

    /// Gets detailed information about a specific node for diagnostics
    pub async fn get_node_details(
        &self,
        node_name: &str,
    ) -> Result<NodeDetails, Box<dyn std::error::Error>> {
        let nodes: Api<Node> = Api::all(self.inner().clone());
        let node = nodes.get(node_name).await?;

        // Conditions
        let conditions = node
            .status
            .as_ref()
            .and_then(|s| s.conditions.as_ref())
            .map(|conds| {
                conds
                    .iter()
                    .map(|c| NodeCondition {
                        type_: c.type_.clone(),
                        status: c.status.clone(),
                        reason: c.reason.clone(),
                        message: c.message.clone(),
                    })
                    .collect()
            })
            .unwrap_or_default();

        // Resources - capacity
        let capacity = node
            .status
            .as_ref()
            .and_then(|s| s.capacity.as_ref())
            .map(|c| NodeResources {
                cpu: c.get("cpu").map(|q| q.0.clone()).unwrap_or_default(),
                memory: c.get("memory").map(|q| q.0.clone()).unwrap_or_default(),
                pods: c.get("pods").map(|q| q.0.clone()).unwrap_or_default(),
            })
            .unwrap_or(NodeResources {
                cpu: "unknown".to_string(),
                memory: "unknown".to_string(),
                pods: "unknown".to_string(),
            });

        // Resources - allocatable
        let allocatable = node
            .status
            .as_ref()
            .and_then(|s| s.allocatable.as_ref())
            .map(|a| NodeResources {
                cpu: a.get("cpu").map(|q| q.0.clone()).unwrap_or_default(),
                memory: a.get("memory").map(|q| q.0.clone()).unwrap_or_default(),
                pods: a.get("pods").map(|q| q.0.clone()).unwrap_or_default(),
            })
            .unwrap_or(NodeResources {
                cpu: "unknown".to_string(),
                memory: "unknown".to_string(),
                pods: "unknown".to_string(),
            });

        // Taints
        let taints = node
            .spec
            .as_ref()
            .and_then(|s| s.taints.as_ref())
            .map(|t| {
                t.iter()
                    .map(|taint| {
                        format!(
                            "{}={} (effect: {})",
                            taint.key,
                            taint.value.as_deref().unwrap_or(""),
                            taint.effect
                        )
                    })
                    .collect()
            })
            .unwrap_or_default();

        // Labels
        let labels = node
            .metadata
            .labels
            .clone()
            .unwrap_or_default()
            .into_iter()
            .collect::<Vec<_>>();

        // Count pods on the node
        let pods: Api<Pod> = Api::all(self.inner().clone());
        let pod_list = pods.list(&ListParams::default()).await?;
        let pod_count = pod_list
            .items
            .iter()
            .filter(|pod| {
                pod.spec
                    .as_ref()
                    .and_then(|s| s.node_name.as_ref())
                    .map(|n| n == node_name)
                    .unwrap_or(false)
            })
            .count();

        let version = node
            .status
            .as_ref()
            .and_then(|s| s.node_info.as_ref())
            .map(|info| info.kubelet_version.clone())
            .unwrap_or_else(|| "unknown".to_string());

        Ok(NodeDetails {
            name: node_name.to_string(),
            conditions,
            capacity,
            allocatable,
            taints,
            labels,
            pod_count,
            version,
        })
    }
}
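
Note that get_node_details counts pods by listing every pod in the cluster and filtering client-side. An alternative sketch (an assumption, not what this commit does) pushes the filter to the API server with a field selector:

use k8s_openapi::api::core::v1::Pod;
use kube::{api::ListParams, Api};

async fn pods_on_node(client: kube::Client, node_name: &str) -> Result<usize, kube::Error> {
    let pods: Api<Pod> = Api::all(client);
    // Server-side filter: only pods scheduled onto this node are returned
    let lp = ListParams::default().fields(&format!("spec.nodeName={}", node_name));
    Ok(pods.list(&lp).await?.items.len())
}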
src/k8s/pod_watcher.rs (new file, 293 lines)
@@ -0,0 +1,293 @@
use futures::TryStreamExt;
use k8s_openapi::api::core::v1::Pod;
use kube::{
    runtime::{watcher, watcher::Config, WatchStreamExt},
    Api, ResourceExt,
};
use std::collections::HashMap;
use tracing::{debug, info, warn};

use super::client::KubeClient;

#[derive(Debug, Clone)]
pub struct PodEvent {
    pub pod_name: String,
    pub namespace: String,
    pub event_type: PodEventType,
}

#[derive(Debug, Clone, PartialEq)]
pub enum PodEventType {
    HighRestartCount { count: i32 },
    Pending { reason: Option<String> },
    Failed { reason: Option<String> },
    CrashLoopBackOff,
    ImagePullError,
    ContainerCreating { duration_seconds: i64 },
}

pub struct PodWatcher {
    api: Api<Pod>,
    // Track restart counts to detect increases
    restart_counts: HashMap<String, i32>,
    // Track already reported issues to prevent spam
    reported_issues: HashMap<String, PodEventType>,
}

impl PodWatcher {
    pub fn new(kube_client: &KubeClient) -> Self {
        let api = Api::all(kube_client.inner().clone());
        Self {
            api,
            restart_counts: HashMap::new(),
            reported_issues: HashMap::new(),
        }
    }

    /// Start watching pod events
    pub async fn watch(
        mut self,
    ) -> Result<
        impl futures::Stream<Item = Result<PodEvent, Box<dyn std::error::Error + Send + Sync>>>,
        Box<dyn std::error::Error>,
    > {
        info!("Starting pod watcher");

        // Initialize current state
        let pods = self.api.list(&Default::default()).await?;
        for pod in pods.items {
            let key = Self::pod_key(&pod);
            let restart_count = Self::get_restart_count(&pod);
            self.restart_counts.insert(key, restart_count);
        }

        let stream = watcher(self.api.clone(), Config::default())
            .applied_objects()
            .try_filter_map(move |pod| {
                let event = self.process_pod_event(pod);
                futures::future::ready(Ok(event))
            })
            .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>);

        Ok(stream)
    }

    fn process_pod_event(&mut self, pod: Pod) -> Option<PodEvent> {
        let name = pod.name_any();
        let namespace = pod.namespace().unwrap_or_else(|| "default".to_string());
        let key = format!("{}/{}", namespace, name);

        let phase = pod
            .status
            .as_ref()
            .and_then(|s| s.phase.as_deref())
            .unwrap_or("Unknown");

        // Helper to check if we should report this issue
        let should_report = |event_type: &PodEventType| -> bool {
            match self.reported_issues.get(&key) {
                Some(prev) if prev == event_type => false, // Already reported same issue
                _ => true,
            }
        };

        // Check for high restart count
        let current_restart_count = Self::get_restart_count(&pod);
        let previous_restart_count = self.restart_counts.get(&key).copied().unwrap_or(0);

        if current_restart_count > previous_restart_count && current_restart_count >= 3 {
            let event_type = PodEventType::HighRestartCount {
                count: current_restart_count,
            };

            if should_report(&event_type) {
                warn!(
                    pod = %name,
                    namespace = %namespace,
                    restart_count = current_restart_count,
                    "Pod has high restart count"
                );
                self.restart_counts.insert(key.clone(), current_restart_count);
                self.reported_issues.insert(key.clone(), event_type.clone());
                return Some(PodEvent {
                    pod_name: name,
                    namespace,
                    event_type,
                });
            }
        }

        self.restart_counts.insert(key.clone(), current_restart_count);

        // Check container states for specific errors
        if let Some(container_statuses) = pod
            .status
            .as_ref()
            .and_then(|s| s.container_statuses.as_ref())
        {
            for cs in container_statuses {
                if let Some(waiting) = cs.state.as_ref().and_then(|s| s.waiting.as_ref()) {
                    let reason = waiting.reason.as_deref().unwrap_or("");

                    match reason {
                        "CrashLoopBackOff" => {
                            let event_type = PodEventType::CrashLoopBackOff;
                            if should_report(&event_type) {
                                warn!(
                                    pod = %name,
                                    namespace = %namespace,
                                    container = %cs.name,
                                    "Container in CrashLoopBackOff"
                                );
                                self.reported_issues.insert(key.clone(), event_type.clone());
                                return Some(PodEvent {
                                    pod_name: name,
                                    namespace,
                                    event_type,
                                });
                            }
                        }
                        "ImagePullBackOff" | "ErrImagePull" => {
                            let event_type = PodEventType::ImagePullError;
                            if should_report(&event_type) {
                                warn!(
                                    pod = %name,
                                    namespace = %namespace,
                                    container = %cs.name,
                                    "Image pull error"
                                );
                                self.reported_issues.insert(key.clone(), event_type.clone());
                                return Some(PodEvent {
                                    pod_name: name,
                                    namespace,
                                    event_type,
                                });
                            }
                        }
                        "ContainerCreating" => {
                            // Check how long it's been creating
                            if let Some(start_time) = pod
                                .status
                                .as_ref()
                                .and_then(|s| s.start_time.as_ref())
                            {
                                let duration = chrono::Utc::now()
                                    .signed_duration_since(start_time.0)
                                    .num_seconds();

                                if duration > 300 {
                                    // 5 minutes
                                    let event_type = PodEventType::ContainerCreating {
                                        duration_seconds: duration,
                                    };
                                    if should_report(&event_type) {
                                        warn!(
                                            pod = %name,
                                            namespace = %namespace,
                                            duration_seconds = duration,
                                            "Container creating for too long"
                                        );
                                        self.reported_issues.insert(key.clone(), event_type.clone());
                                        return Some(PodEvent {
                                            pod_name: name,
                                            namespace,
                                            event_type,
                                        });
                                    }
                                }
                            }
                        }
                        _ => {}
                    }
                }
            }
        }

        // Check pod phase
        match phase {
            "Pending" => {
                if let Some(reason) = Self::get_pending_reason(&pod) {
                    let event_type = PodEventType::Pending {
                        reason: Some(reason.clone()),
                    };
                    if should_report(&event_type) {
                        warn!(
                            pod = %name,
                            namespace = %namespace,
                            reason = %reason,
                            "Pod stuck in Pending"
                        );
                        self.reported_issues.insert(key.clone(), event_type.clone());
                        return Some(PodEvent {
                            pod_name: name,
                            namespace,
                            event_type,
                        });
                    }
                }
            }
            "Failed" => {
                let reason = Self::get_failure_reason(&pod);
                let event_type = PodEventType::Failed {
                    reason: reason.clone(),
                };
                if should_report(&event_type) {
                    warn!(
                        pod = %name,
                        namespace = %namespace,
                        reason = ?reason,
                        "Pod in Failed state"
                    );
                    self.reported_issues.insert(key, event_type.clone());
                    return Some(PodEvent {
                        pod_name: name,
                        namespace,
                        event_type,
                    });
                }
            }
            _ => {}
        }

        None
    }

    fn pod_key(pod: &Pod) -> String {
        format!(
            "{}/{}",
            pod.namespace().unwrap_or_else(|| "default".to_string()),
            pod.name_any()
        )
    }

    fn get_restart_count(pod: &Pod) -> i32 {
        pod.status
            .as_ref()
            .and_then(|s| s.container_statuses.as_ref())
            .map(|cs| cs.iter().map(|c| c.restart_count).sum())
            .unwrap_or(0)
    }

    fn get_pending_reason(pod: &Pod) -> Option<String> {
        pod.status
            .as_ref()
            .and_then(|s| s.conditions.as_ref())
            .and_then(|conditions| {
                conditions
                    .iter()
                    .find(|c| c.type_ == "PodScheduled" && c.status == "False")
                    .and_then(|c| c.reason.clone())
            })
    }

    fn get_failure_reason(pod: &Pod) -> Option<String> {
        pod.status
            .as_ref()
            .and_then(|s| s.reason.clone())
            .or_else(|| {
                pod.status
                    .as_ref()
                    .and_then(|s| s.message.clone())
            })
    }
}
src/k8s/pods.rs (new file, 149 lines)
@@ -0,0 +1,149 @@
use super::client::KubeClient;
use super::types::{ContainerStatus, PodCondition, PodDetails, PodLogs};
use k8s_openapi::api::core::v1::Pod;
use kube::{api::LogParams, Api};

impl KubeClient {
    /// Gets detailed information about a specific pod for diagnostics
    pub async fn get_pod_details(
        &self,
        namespace: &str,
        pod_name: &str,
    ) -> Result<PodDetails, Box<dyn std::error::Error>> {
        let pods: Api<Pod> = Api::namespaced(self.inner().clone(), namespace);
        let pod = pods.get(pod_name).await?;

        // Extract phase
        let phase = pod
            .status
            .as_ref()
            .and_then(|s| s.phase.clone())
            .unwrap_or_else(|| "Unknown".to_string());

        // Extract node name
        let node_name = pod.spec.as_ref().and_then(|s| s.node_name.clone());

        // Extract conditions
        let conditions = pod
            .status
            .as_ref()
            .and_then(|s| s.conditions.as_ref())
            .map(|conds| {
                conds
                    .iter()
                    .map(|c| PodCondition {
                        type_: c.type_.clone(),
                        status: c.status.clone(),
                        reason: c.reason.clone(),
                        message: c.message.clone(),
                    })
                    .collect()
            })
            .unwrap_or_default();

        // Extract container statuses
        let container_statuses: Vec<ContainerStatus> = pod
            .status
            .as_ref()
            .and_then(|s| s.container_statuses.as_ref())
            .map(|cs| {
                cs.iter()
                    .map(|c| {
                        let (state, state_reason, state_message) = if let Some(waiting) =
                            &c.state.as_ref().and_then(|s| s.waiting.as_ref())
                        {
                            (
                                "Waiting".to_string(),
                                waiting.reason.clone(),
                                waiting.message.clone(),
                            )
                        } else if let Some(_running) =
                            &c.state.as_ref().and_then(|s| s.running.as_ref())
                        {
                            ("Running".to_string(), None, None)
                        } else if let Some(terminated) =
                            &c.state.as_ref().and_then(|s| s.terminated.as_ref())
                        {
                            (
                                "Terminated".to_string(),
                                terminated.reason.clone(),
                                terminated.message.clone(),
                            )
                        } else {
                            ("Unknown".to_string(), None, None)
                        };

                        ContainerStatus {
                            name: c.name.clone(),
                            ready: c.ready,
                            restart_count: c.restart_count,
                            state,
                            state_reason,
                            state_message,
                        }
                    })
                    .collect()
            })
            .unwrap_or_default();

        // Calculate total restart count
        let restart_count: i32 = container_statuses.iter().map(|c| c.restart_count).sum();

        // Extract start time
        let start_time = pod
            .status
            .as_ref()
            .and_then(|s| s.start_time.as_ref())
            .map(|t| t.0.to_rfc3339());

        Ok(PodDetails {
            name: pod_name.to_string(),
            namespace: namespace.to_string(),
            phase,
            node_name,
            conditions,
            container_statuses,
            restart_count,
            start_time,
        })
    }

    /// Gets recent logs from a pod's container
    pub async fn get_pod_logs(
        &self,
        namespace: &str,
        pod_name: &str,
        container_name: Option<&str>,
        tail_lines: Option<i64>,
    ) -> Result<PodLogs, Box<dyn std::error::Error>> {
        let pods: Api<Pod> = Api::namespaced(self.inner().clone(), namespace);

        // Determine which container to get logs from
        let container = if let Some(name) = container_name {
            name.to_string()
        } else {
            // Get first container name
            let pod = pods.get(pod_name).await?;
            pod.spec
                .as_ref()
                .and_then(|s| s.containers.first())
                .map(|c| c.name.clone())
                .ok_or("No containers found in pod")?
        };

        let log_params = LogParams {
            container: Some(container.clone()),
            tail_lines,
            ..Default::default()
        };

        let logs = pods.logs(pod_name, &log_params).await?;

        Ok(PodLogs {
            pod_name: pod_name.to_string(),
            namespace: namespace.to_string(),
            container_name: container,
            logs,
        })
    }
}
src/k8s/types.rs (new file, 76 lines)
@@ -0,0 +1,76 @@
use serde::Serialize;

// Node types
#[derive(Debug, Serialize)]
pub struct NodeStatus {
    pub name: String,
    pub ready: bool,
    pub version: String,
    pub internal_ip: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct NodeCondition {
    pub type_: String,
    pub status: String,
    pub reason: Option<String>,
    pub message: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct NodeResources {
    pub cpu: String,
    pub memory: String,
    pub pods: String,
}

#[derive(Debug, Serialize)]
pub struct NodeDetails {
    pub name: String,
    pub conditions: Vec<NodeCondition>,
    pub capacity: NodeResources,
    pub allocatable: NodeResources,
    pub taints: Vec<String>,
    pub labels: Vec<(String, String)>,
    pub pod_count: usize,
    pub version: String,
}

// Pod types
#[derive(Debug, Serialize)]
pub struct PodCondition {
    pub type_: String,
    pub status: String,
    pub reason: Option<String>,
    pub message: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct ContainerStatus {
    pub name: String,
    pub ready: bool,
    pub restart_count: i32,
    pub state: String,
    pub state_reason: Option<String>,
    pub state_message: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct PodDetails {
    pub name: String,
    pub namespace: String,
    pub phase: String,
    pub node_name: Option<String>,
    pub conditions: Vec<PodCondition>,
    pub container_statuses: Vec<ContainerStatus>,
    pub restart_count: i32,
    pub start_time: Option<String>,
}

#[derive(Debug, Serialize)]
pub struct PodLogs {
    pub pod_name: String,
    pub namespace: String,
    pub container_name: String,
    pub logs: String,
}
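
Illustrative sketch (not in the commit) of what one of these types looks like on the wire: execute_tool serializes them with serde_json::to_string_pretty before handing the result back to the model. The names are hypothetical.

fn sample_payload() -> Result<String, serde_json::Error> {
    let logs = PodLogs {
        pod_name: "api-0".to_string(),       // hypothetical pod
        namespace: "default".to_string(),
        container_name: "app".to_string(),   // hypothetical container
        logs: "Error: connection refused".to_string(),
    };
    // Produces:
    // {
    //   "pod_name": "api-0",
    //   "namespace": "default",
    //   "container_name": "app",
    //   "logs": "Error: connection refused"
    // }
    serde_json::to_string_pretty(&logs)
}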
src/k8s/watcher.rs (new file, 170 lines)
@@ -0,0 +1,170 @@
use futures::TryStreamExt;
use k8s_openapi::api::core::v1::Node;
use kube::{
    runtime::{watcher, watcher::Config, WatchStreamExt},
    Api, ResourceExt,
};
use std::collections::HashMap;
use tracing::{debug, info, warn};

use super::client::KubeClient;

#[derive(Debug, Clone)]
pub struct NodeEvent {
    pub node_name: String,
    pub ready: bool,
    pub event_type: NodeEventType,
}

#[derive(Debug, Clone, PartialEq)]
pub enum NodeEventType {
    BecameNotReady,
    BecameReady,
    ConditionChanged { condition_type: String, status: String, reason: Option<String> },
}

pub struct NodeWatcher {
    api: Api<Node>,
    previous_states: HashMap<String, bool>,
}

impl NodeWatcher {
    pub fn new(kube_client: &KubeClient) -> Self {
        let api = Api::all(kube_client.inner().clone());
        Self {
            api,
            previous_states: HashMap::new(),
        }
    }

    /// Start watching node events
    pub async fn watch(
        mut self,
    ) -> Result<
        impl futures::Stream<Item = Result<NodeEvent, Box<dyn std::error::Error + Send + Sync>>>,
        Box<dyn std::error::Error>,
    > {
        info!("Starting node watcher");

        // Initialize current state
        let nodes = self.api.list(&Default::default()).await?;
        for node in nodes.items {
            let name = node.name_any();
            let ready = Self::is_node_ready(&node);
            self.previous_states.insert(name.clone(), ready);
            debug!("Initial state: {} = Ready:{}", name, ready);
        }

        let stream = watcher(self.api.clone(), Config::default())
            .applied_objects()
            .try_filter_map(move |node| {
                let event = self.process_node_event(node);
                futures::future::ready(Ok(event))
            })
            .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>);

        Ok(stream)
    }

    fn process_node_event(&mut self, node: Node) -> Option<NodeEvent> {
        let name = node.name_any();
        let current_ready = Self::is_node_ready(&node);
        let previous_ready = self.previous_states.get(&name).copied();

        let event = match (previous_ready, current_ready) {
            // Node became NotReady
            (Some(true), false) | (None, false) => {
                warn!("Node {} became NotReady", name);
                Some(NodeEvent {
                    node_name: name.clone(),
                    ready: false,
                    event_type: NodeEventType::BecameNotReady,
                })
            }
            // Node became Ready
            (Some(false), true) => {
                info!("Node {} became Ready", name);
                Some(NodeEvent {
                    node_name: name.clone(),
                    ready: true,
                    event_type: NodeEventType::BecameReady,
                })
            }
            // Check for condition changes even if Ready state is the same
            _ => {
                if let Some(problematic_condition) = Self::find_problematic_condition(&node) {
                    warn!(
                        "Node {} has problematic condition: {}",
                        name, problematic_condition.condition_type
                    );
                    Some(NodeEvent {
                        node_name: name.clone(),
                        ready: current_ready,
                        event_type: NodeEventType::ConditionChanged {
                            condition_type: problematic_condition.condition_type,
                            status: problematic_condition.status,
                            reason: problematic_condition.reason,
                        },
                    })
                } else {
                    None
                }
            }
        };

        // Update state
        self.previous_states.insert(name, current_ready);
        event
    }

    fn is_node_ready(node: &Node) -> bool {
        node.status
            .as_ref()
            .and_then(|s| s.conditions.as_ref())
            .and_then(|conditions| {
                conditions
                    .iter()
                    .find(|c| c.type_ == "Ready")
                    .map(|c| c.status == "True")
            })
            .unwrap_or(false)
    }

    fn find_problematic_condition(node: &Node) -> Option<ProblematicCondition> {
        node.status
            .as_ref()
            .and_then(|s| s.conditions.as_ref())
            .and_then(|conditions| {
                for condition in conditions {
                    // Check for problematic conditions
                    match condition.type_.as_str() {
                        "Ready" if condition.status != "True" => {
                            return Some(ProblematicCondition {
                                condition_type: condition.type_.clone(),
                                status: condition.status.clone(),
                                reason: condition.reason.clone(),
                            });
                        }
                        "MemoryPressure" | "DiskPressure" | "PIDPressure"
                            if condition.status == "True" =>
                        {
                            return Some(ProblematicCondition {
                                condition_type: condition.type_.clone(),
                                status: condition.status.clone(),
                                reason: condition.reason.clone(),
                            });
                        }
                        _ => {}
                    }
                }
                None
            })
    }
}

#[derive(Debug)]
struct ProblematicCondition {
    condition_type: String,
    status: String,
    reason: Option<String>,
}
src/main.rs (new file, 123 lines)
@@ -0,0 +1,123 @@
mod ai;
mod config;
mod events;
mod k8s;
mod tools;

use ai::AIClient;
use config::Config;
use events::EventHandler;
use futures::StreamExt;
use k8s::{KubeClient, NodeWatcher, PodWatcher};
use tracing::{error, info};
use tracing_subscriber::{fmt, prelude::*, EnvFilter};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize logging
    tracing_subscriber::registry()
        .with(fmt::layer())
        .with(EnvFilter::from_default_env().add_directive(tracing::Level::INFO.into()))
        .init();

    info!("Starting duty-ai-ops daemon");

    // Load configuration
    let cfg = Config::load("config.toml")?;
    info!("Configuration loaded");

    // Initialize clients
    let kube_client = KubeClient::new().await?;
    info!("Connected to Kubernetes cluster");

    let ai_client = AIClient::new(&cfg);
    info!(
        model = %cfg.model,
        max_concurrent = cfg.max_concurrent_diagnoses,
        "AI client initialized"
    );

    // Create event handler with concurrency limit
    let event_handler = EventHandler::new(
        kube_client.clone(),
        ai_client,
        cfg.max_concurrent_diagnoses,
    );

    // Start node watcher
    let node_watcher = NodeWatcher::new(&kube_client);
    let node_stream = node_watcher.watch().await?;
    tokio::pin!(node_stream);

    // Start pod watcher
    let pod_watcher = PodWatcher::new(&kube_client);
    let pod_stream = pod_watcher.watch().await?;
    tokio::pin!(pod_stream);

    info!("Node and Pod watchers started, monitoring for events");

    // Setup graceful shutdown
    let ctrl_c = tokio::signal::ctrl_c();
    tokio::pin!(ctrl_c);

    // Event loop
    loop {
        tokio::select! {
            // Handle shutdown signal
            _ = &mut ctrl_c => {
                info!("Received shutdown signal, exiting gracefully");
                break;
            }

            // Handle node events
            Some(event_result) = node_stream.next() => {
                match event_result {
                    Ok(event) => {
                        let handler = event_handler.clone();
                        tokio::spawn(async move {
                            handler.handle_node_event(event).await;
                        });
                    }
                    Err(e) => {
                        error!(error = %e, "Error processing node watch event");
                    }
                }
            }

            // Handle pod events
            Some(event_result) = pod_stream.next() => {
                match event_result {
                    Ok(event) => {
                        let handler = event_handler.clone();
                        tokio::spawn(async move {
                            handler.handle_pod_event(event).await;
                        });
                    }
                    Err(e) => {
                        error!(error = %e, "Error processing pod watch event");
                    }
                }
            }

            // If streams end unexpectedly
            else => {
                error!("Watch streams ended unexpectedly, restarting...");
                tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;

                // Recreate watchers and streams
                let node_watcher = NodeWatcher::new(&kube_client);
                let new_node_stream = node_watcher.watch().await?;
                node_stream.set(new_node_stream);

                let pod_watcher = PodWatcher::new(&kube_client);
                let new_pod_stream = pod_watcher.watch().await?;
                pod_stream.set(new_pod_stream);

                info!("Watchers restarted");
            }
        }
    }

    info!("Daemon stopped");
    Ok(())
}
src/tools/mod.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
mod node_details;
mod pod_tools;

pub use node_details::get_node_details_tool;
pub use pod_tools::{get_pod_details_tool, get_pod_logs_tool};
src/tools/node_details.rs (new file, 25 lines)
@@ -0,0 +1,25 @@
use async_openai::types::chat::{ChatCompletionTool, FunctionObject};
use serde_json::json;

/// Creates tool definition for getting detailed node information
pub fn get_node_details_tool() -> ChatCompletionTool {
    ChatCompletionTool {
        function: FunctionObject {
            name: "get_node_details".to_string(),
            description: Some(
                "Get detailed diagnostic information about a specific Kubernetes node including conditions, resources, taints, labels, and pod count".to_string(),
            ),
            parameters: Some(json!({
                "type": "object",
                "properties": {
                    "node_name": {
                        "type": "string",
                        "description": "The name of the node to get details for"
                    }
                },
                "required": ["node_name"]
            })),
            strict: None,
        },
    }
}
src/tools/pod_tools.rs (new file, 64 lines)
@@ -0,0 +1,64 @@
use async_openai::types::chat::{ChatCompletionTool, FunctionObject};
use serde_json::json;

/// Creates tool definition for getting detailed pod information
pub fn get_pod_details_tool() -> ChatCompletionTool {
    ChatCompletionTool {
        function: FunctionObject {
            name: "get_pod_details".to_string(),
            description: Some(
                "Get detailed diagnostic information about a specific Kubernetes pod including phase, conditions, container statuses, and restart counts".to_string(),
            ),
            parameters: Some(json!({
                "type": "object",
                "properties": {
                    "namespace": {
                        "type": "string",
                        "description": "The namespace of the pod"
                    },
                    "pod_name": {
                        "type": "string",
                        "description": "The name of the pod to get details for"
                    }
                },
                "required": ["namespace", "pod_name"]
            })),
            strict: None,
        },
    }
}

/// Creates tool definition for getting pod logs
pub fn get_pod_logs_tool() -> ChatCompletionTool {
    ChatCompletionTool {
        function: FunctionObject {
            name: "get_pod_logs".to_string(),
            description: Some(
                "Get recent logs from a pod's container to diagnose runtime issues and errors".to_string(),
            ),
            parameters: Some(json!({
                "type": "object",
                "properties": {
                    "namespace": {
                        "type": "string",
                        "description": "The namespace of the pod"
                    },
                    "pod_name": {
                        "type": "string",
                        "description": "The name of the pod to get logs from"
                    },
                    "container_name": {
                        "type": "string",
                        "description": "Optional: specific container name. If not provided, gets logs from first container"
                    },
                    "tail_lines": {
                        "type": "integer",
                        "description": "Number of recent log lines to retrieve. Default is 50 lines"
                    }
                },
                "required": ["namespace", "pod_name"]
            })),
            strict: None,
        },
    }
}
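
For reference, a sketch (not in the commit) of the arguments string a model emits against the get_pod_logs schema above, parsed the same way execute_tool does; the struct mirrors the private PodLogsArgs in src/ai/diagnostics.rs and the pod name is hypothetical.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct PodLogsArgs {
    namespace: String,
    pod_name: String,
    container_name: Option<String>,
    tail_lines: Option<i64>,
}

fn parse_example() -> Result<(), serde_json::Error> {
    let raw = r#"{"namespace": "default", "pod_name": "api-0", "tail_lines": 100}"#;
    let args: PodLogsArgs = serde_json::from_str(raw)?;
    assert!(args.container_name.is_none()); // omitted optional fields become None
    Ok(())
}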