feat(agent)!: add support for mTLS authentication in gRPC server (#54)

* refactor(fancontroller): improve fan controller validation logic and error handling for temperature steps

* refactor(agent): restructure gRPC server implementation by moving it to a new api package for better organization and maintainability

* feat(agent): implement gRPC server for managing compute blade agents and add graceful shutdown support
refactor(agent): restructure agent code by moving API logic to a dedicated file and improving error handling
fix(agent): update logging messages for clarity and consistency across the agent's operations
chore(agent): remove unused API code and consolidate event handling logic for better maintainability
style(agent): improve code formatting and organization for better readability and adherence to conventions

* feat(agent): add support for TLS configuration in gRPC server

* feat(api): add gRPC server authentication
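In grpc-go terms, the server-side half of such authentication typically amounts to wrapping a tls.Config that demands and verifies client certificates. A minimal sketch, assuming only standard crypto/tls and google.golang.org/grpc/credentials APIs; mtlsServerCreds and its parameters are illustrative names, not the agent's actual code:

package main

import (
    "crypto/tls"
    "crypto/x509"
    "errors"
    "fmt"
    "os"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials"
)

// mtlsServerCreds builds mutual-TLS transport credentials: the server
// presents its own certificate and requires clients to present one
// signed by the given CA. Hypothetical helper, for illustration only.
func mtlsServerCreds(certPath, keyPath, caPath string) (credentials.TransportCredentials, error) {
    cert, err := tls.LoadX509KeyPair(certPath, keyPath)
    if err != nil {
        return nil, fmt.Errorf("load server keypair: %w", err)
    }
    caPEM, err := os.ReadFile(caPath)
    if err != nil {
        return nil, fmt.Errorf("read CA bundle: %w", err)
    }
    pool := x509.NewCertPool()
    if !pool.AppendCertsFromPEM(caPEM) {
        return nil, errors.New("no valid CA certificates in bundle")
    }
    return credentials.NewTLS(&tls.Config{
        Certificates: []tls.Certificate{cert},
        ClientCAs:    pool,
        ClientAuth:   tls.RequireAndVerifyClientCert, // this is what makes it mutual TLS
    }), nil
}

// newAuthenticatedServer attaches the credentials to a gRPC server.
func newAuthenticatedServer(creds credentials.TransportCredentials) *grpc.Server {
    return grpc.NewServer(grpc.Creds(creds))
}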

* fix

* feat(config): add listen mode configuration to support tcp or unix sockets
feat(agent): implement listen mode in gRPC service to allow flexible socket types
feat(bladectl): enhance configuration loading and add support for TLS credentials
fix(bladectl): improve error handling for gRPC connection and event emission
style(logging): change log level from Warn to Info for better clarity in logs
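The listen-mode option largely reduces to picking the network argument for net.Listen. A sketch of the idea using only standard library APIs; listenOn is an illustrative helper, not the agent's actual implementation:

package main

import (
    "fmt"
    "net"
    "os"
)

// listenOn opens either a TCP or a unix-domain socket listener,
// mirroring the "tcp or unix" listen modes described above.
func listenOn(mode, addr string) (net.Listener, error) {
    switch mode {
    case "tcp":
        // addr is a host:port string such as ":9666"
        return net.Listen("tcp", addr)
    case "unix":
        // Clear a stale socket file left behind by a previous run.
        if err := os.Remove(addr); err != nil && !os.IsNotExist(err) {
            return nil, err
        }
        return net.Listen("unix", addr)
    default:
        return nil, fmt.Errorf("unsupported listen mode %q", mode)
    }
}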

* add logging middleware + fixes

* fix remote connection to the gRPC API server

Debugging the SAN issues took the soul out of me... and then the stupid
mistake in cmd_root where I didn't construct the TLS credentials
correctly... Oh dear...
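For anyone hitting the same wall: gRPC's TLS client verifies the server certificate against the dialed hostname (or an explicit ServerName), so the certificate needs a matching Subject Alternative Name. A hedged sketch of correct client-side credential construction — assumed shapes, not the actual cmd_root code:

package main

import (
    "crypto/tls"
    "crypto/x509"
    "os"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials"
)

// dialBlade connects with client-certificate (mTLS) credentials.
// All names and parameters here are illustrative.
func dialBlade(addr, certPath, keyPath, caPath, serverName string) (*grpc.ClientConn, error) {
    cert, err := tls.LoadX509KeyPair(certPath, keyPath)
    if err != nil {
        return nil, err
    }
    caPEM, err := os.ReadFile(caPath)
    if err != nil {
        return nil, err
    }
    pool := x509.NewCertPool()
    pool.AppendCertsFromPEM(caPEM)
    creds := credentials.NewTLS(&tls.Config{
        Certificates: []tls.Certificate{cert},
        RootCAs:      pool,
        // Must match a SAN in the server certificate, otherwise the
        // handshake fails with an x509 hostname-mismatch error.
        ServerName: serverName,
    })
    return grpc.NewClient(addr, grpc.WithTransportCredentials(creds))
}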

* cleanup

* cleanup

* cleanup commands

* cleanup

* make README.md nicer

* Update cmd/agent/main.go

Co-authored-by: Matthias Riegler <github@m4tbit.de>

* Update cmd/bladectl/cmd_root.go

Co-authored-by: Matthias Riegler <github@m4tbit.de>

* move bladectl config into correct directory

* fix bugs

* // FIXME: No dead code

* nit: code style

* nit(YAGNI): you ain't gonna need it. Don't make life harder than it needs to be

* nit(YAGNI): you ain't gonna need it. Don't make life harder than it needs to be

* nit(YAGNI): you ain't gonna need it. Don't make life harder than it needs to be

* nit(cmd_identify)

---------

Co-authored-by: Matthias Riegler <github@m4tbit.de>
Authored by Cedric Kienzler on 2025-05-12 00:00:55 +02:00, committed by GitHub
parent ec6229ad86
commit 70541d86ba
60 changed files with 2189 additions and 650 deletions


@@ -1,16 +1,15 @@
 # Default configuration for the compute-blade-agent
-log:
-  mode: production # production, development
 # Listen configuration
 listen:
   metrics: ":9666"
   grpc: /tmp/compute-blade-agent.sock
+  authenticated: false
+  mode: unix # tcp or unix
 # Hardware abstraction layer configuration
 hal:
-  # For the default fan unit, fanspeed measurement is causing a tiny bit of CPU laod.
+  # For the default fan unit, fanspeed measurement is causing a tiny bit of CPU load.
   # Sometimes it might not be desired
   rpm_reporting_standard_fan_unit: true
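Judging from the fields main.go reads below (cbAgentConfig.Listen.Grpc, .GrpcAuthenticated, .GrpcListenMode) and runPrometheusEndpoint taking a *api.Config, the listen block above plausibly unmarshals into a struct along these lines — a reconstruction with assumed mapstructure tags, not the package's verbatim definition:

package api

// Config mirrors the YAML listen block; the tag names are inferred
// from the config keys above and may not match the real source exactly.
type Config struct {
    Metrics           string `mapstructure:"metrics"`       // e.g. ":9666"
    Grpc              string `mapstructure:"grpc"`          // socket path or host:port
    GrpcAuthenticated bool   `mapstructure:"authenticated"` // require client certificates
    GrpcListenMode    string `mapstructure:"mode"`          // "tcp" or "unix"
}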
@@ -35,14 +34,13 @@ criticalLedColor:
 # Enable/disable stealth mode; turns off all LEDs on the blade
 stealth_mode: false
 # Simple fan-speed controls based on the SoC temperature
 fan_controller:
-  # For now, this is only supporting a two-step configuration.
   steps:
     - temperature: 45
       percent: 40
     - temperature: 55
       percent: 80
 # Critical temperature threshold
 critical_temperature_threshold: 60
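The steps above describe a simple threshold curve: at 45 °C the fan runs at 40 %, from 55 °C at 80 %. A hypothetical sketch of such a lookup, assuming a validated, non-empty, ascending steps slice (the real logic, including the validation the fancontroller refactor mentions, lives in the fancontroller package):

package main

// FanStep mirrors one entry of fan_controller.steps above.
type FanStep struct {
    Temperature int // threshold in degrees Celsius
    Percent     int // fan duty cycle at or above the threshold
}

// speedFor returns the duty cycle of the highest step whose
// temperature threshold has been reached; below the first step it
// falls back to that step's duty. Illustrative only.
func speedFor(steps []FanStep, tempC int) int {
    percent := steps[0].Percent
    for _, step := range steps {
        if tempC >= step.Temperature {
            percent = step.Percent
        }
    }
    return percent
}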

cmd/agent/main.go

@@ -2,8 +2,8 @@ package main
 import (
     "context"
+    "errors"
     "fmt"
-    "net"
     "net/http"
     "net/http/pprof"
     "os"
@@ -14,12 +14,12 @@ import (
"time"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/spf13/pflag"
"github.com/spf13/viper"
bladeapiv1alpha1 "github.com/uptime-induestries/compute-blade-agent/api/bladeapi/v1alpha1"
"github.com/uptime-induestries/compute-blade-agent/internal/agent"
"github.com/uptime-induestries/compute-blade-agent/pkg/log"
"github.com/uptime-industries/compute-blade-agent/internal/agent"
"github.com/uptime-industries/compute-blade-agent/internal/api"
"github.com/uptime-industries/compute-blade-agent/pkg/log"
"go.uber.org/zap"
"google.golang.org/grpc"
)
var (
@@ -28,13 +28,12 @@ var (
     Date string
 )
+var debug = pflag.BoolP("debug", "v", false, "enable verbose logging")
 func main() {
-    var wg sync.WaitGroup
     pflag.Parse()
     // Setup configuration
     viper.SetConfigType("yaml")
     // auto-bind environment variables
     viper.SetEnvPrefix("BLADE")
     viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
     viper.AutomaticEnv()
@@ -49,13 +48,11 @@ func main() {
     // setup logger
     var baseLogger *zap.Logger
-    switch logMode := viper.GetString("log.mode"); logMode {
-    case "development":
+    if debug != nil && *debug {
         baseLogger = zap.Must(zap.NewDevelopment())
-    case "production":
+    } else {
         baseLogger = zap.Must(zap.NewProduction())
-    default:
-        panic(fmt.Errorf("invalid log.mode: %s", logMode))
     }
     zapLogger := baseLogger.With(zap.String("app", "compute-blade-agent"))
@@ -71,73 +68,96 @@ func main() {
     // load configuration
     var cbAgentConfig agent.ComputeBladeAgentConfig
     if err := viper.Unmarshal(&cbAgentConfig); err != nil {
-        log.FromContext(ctx).Error("Failed to load configuration", zap.Error(err))
-        cancelCtx(err)
+        log.FromContext(ctx).Fatal("Failed to load configuration", zap.Error(err))
     }
     // setup stop signal handlers
     sigs := make(chan os.Signal, 1)
-    signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
-    wg.Add(1)
+    signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
     go func() {
-        defer wg.Done()
-        // Wait for context cancel or signal
         select {
+        // Wait for context cancel
         case <-ctx.Done():
+        // Wait for signal
         case sig := <-sigs:
-            // On signal, cancel context
-            cancelCtx(fmt.Errorf("signal %s received", sig))
+            switch sig {
+            case syscall.SIGTERM:
+                fallthrough
+            case syscall.SIGINT:
+                fallthrough
+            case syscall.SIGQUIT:
+                // On terminate signal, cancel context causing the program to terminate
+                cancelCtx(fmt.Errorf("signal %s received", sig))
+            default:
+                log.FromContext(ctx).Warn("Received unknown signal", zap.String("signal", sig.String()))
+            }
         }
     }()
     log.FromContext(ctx).Info("Bootstrapping compute-blade-agent", zap.String("version", Version), zap.String("commit", Commit), zap.String("date", Date))
     computebladeAgent, err := agent.NewComputeBladeAgent(ctx, cbAgentConfig)
     if err != nil {
-        log.FromContext(ctx).Error("Failed to create agent", zap.Error(err))
-        cancelCtx(err)
-        os.Exit(1)
+        log.FromContext(ctx).Fatal("Failed to create agent", zap.Error(err))
     }
     // Run agent
-    wg.Add(1)
-    go func() {
-        defer wg.Done()
-        log.FromContext(ctx).Info("Starting agent")
-        err := computebladeAgent.Run(ctx)
-        if err != nil && err != context.Canceled {
-            log.FromContext(ctx).Error("Failed to run agent", zap.Error(err))
-            cancelCtx(err)
-        }
-    }()
+    computebladeAgent.RunAsync(ctx, cancelCtx)
     // Setup GRPC server
-    // FIXME add logging middleware
-    grpcServer := grpc.NewServer()
-    bladeapiv1alpha1.RegisterBladeAgentServiceServer(grpcServer, agent.NewGrpcServiceFor(computebladeAgent))
+    grpcServer := api.NewGrpcApiServer(ctx,
+        api.WithComputeBladeAgent(computebladeAgent),
+        api.WithAuthentication(cbAgentConfig.Listen.GrpcAuthenticated),
+        api.WithListenAddr(cbAgentConfig.Listen.Grpc),
+        api.WithListenMode(cbAgentConfig.Listen.GrpcListenMode),
+    )
+    // Run gRPC API
+    grpcServer.ServeAsync(ctx, cancelCtx)
+    // setup prometheus endpoint
+    promServer := runPrometheusEndpoint(ctx, cancelCtx, &cbAgentConfig.Listen)
+    // Wait for done
+    <-ctx.Done()
+    var wg sync.WaitGroup
+    // Shut-Down GRPC Server
     wg.Add(1)
     go func() {
         defer wg.Done()
-        grpcListen, err := net.Listen("unix", viper.GetString("listen.grpc"))
-        if err != nil {
-            log.FromContext(ctx).Error("Failed to create grpc listener", zap.Error(err))
-            cancelCtx(err)
-            return
-        }
-        log.FromContext(ctx).Info("Starting grpc server", zap.String("address", viper.GetString("listen.grpc")))
-        if err := grpcServer.Serve(grpcListen); err != nil && err != grpc.ErrServerStopped {
-            log.FromContext(ctx).Error("Failed to start grpc server", zap.Error(err))
-            cancelCtx(err)
-        }
-    }()
-    wg.Add(1)
-    go func() {
-        defer wg.Done()
-        <-ctx.Done()
         log.FromContext(ctx).Info("Shutting down grpc server")
         grpcServer.GracefulStop()
     }()
-    // setup prometheus endpoint
+    // Shut-Down Prometheus Endpoint
+    wg.Add(1)
+    go func() {
+        defer wg.Done()
+        shutdownCtx, shutdownCtxCancel := context.WithTimeout(context.Background(), 5*time.Second)
+        defer shutdownCtxCancel()
+        if err := promServer.Shutdown(shutdownCtx); err != nil {
+            log.FromContext(ctx).Error("Failed to shutdown prometheus/pprof server", zap.Error(err))
+        }
+    }()
+    wg.Wait()
+    // Wait for context cancel
+    if err := ctx.Err(); !errors.Is(err, context.Canceled) {
+        log.FromContext(ctx).Fatal("Exiting", zap.Error(err))
+    } else {
+        log.FromContext(ctx).Info("Exiting")
+    }
+}
+func runPrometheusEndpoint(ctx context.Context, cancel context.CancelCauseFunc, apiConfig *api.Config) *http.Server {
     instrumentationHandler := http.NewServeMux()
     instrumentationHandler.Handle("/metrics", promhttp.Handler())
     instrumentationHandler.HandleFunc("/debug/pprof/", pprof.Index)
@@ -145,33 +165,17 @@ func main() {
instrumentationHandler.HandleFunc("/debug/pprof/profile", pprof.Profile)
instrumentationHandler.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
instrumentationHandler.HandleFunc("/debug/pprof/trace", pprof.Trace)
server := &http.Server{Addr: ":9666", Handler: instrumentationHandler}
wg.Add(1)
server := &http.Server{Addr: apiConfig.Metrics, Handler: instrumentationHandler}
// Run Prometheus Endpoint
go func() {
defer wg.Done()
err := server.ListenAndServe()
if err != nil && err != http.ErrServerClosed {
if err != nil && !errors.Is(err, http.ErrServerClosed) {
log.FromContext(ctx).Error("Failed to start prometheus/pprof server", zap.Error(err))
cancelCtx(err)
}
}()
wg.Add(1)
go func() {
defer wg.Done()
<-ctx.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
err := server.Shutdown(shutdownCtx)
if err != nil {
log.FromContext(ctx).Error("Failed to shutdown prometheus/pprof server", zap.Error(err))
cancel(err)
}
}()
// Wait for context cancel
wg.Wait()
if err := ctx.Err(); err != nil && err != context.Canceled {
log.FromContext(ctx).Fatal("Exiting", zap.Error(err))
} else {
log.FromContext(ctx).Info("Exiting")
}
return server
}
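The RunAsync and ServeAsync helpers consumed above live in the agent and the new internal/api package respectively, neither of which is part of this excerpt. A minimal sketch of the pattern they imply — run in a goroutine and surface fatal errors through the context's cancel-cause function; an assumed shape, not the actual implementation:

package api

import (
    "context"
    "errors"
    "net"

    "google.golang.org/grpc"
)

// GrpcApiServer is a trimmed-down stand-in for the real type built by
// NewGrpcApiServer; only what the sketch needs is shown.
type GrpcApiServer struct {
    server   *grpc.Server
    listener net.Listener
}

// ServeAsync starts serving in the background and reports a failed
// Serve through cancel, which is how main.go above consumes it.
func (s *GrpcApiServer) ServeAsync(ctx context.Context, cancel context.CancelCauseFunc) {
    go func() {
        if err := s.server.Serve(s.listener); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
            cancel(err)
        }
    }()
}

// GracefulStop drains in-flight RPCs before stopping, matching the
// shutdown goroutine in main.go.
func (s *GrpcApiServer) GracefulStop() {
    s.server.GracefulStop()
}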