Nomad Platform Deployment
A complete guide to deploying Productify on HashiCorp Nomad, covering cluster setup, job specifications, deployment commands, and monitoring.
Prerequisites
Required Software
- Nomad: 1.6+ cluster (server and client nodes)
- PostgreSQL: 16+ database
- Docker: Runtime on all Nomad client nodes
System Requirements
- Memory: 8 GB RAM minimum per node (16 GB recommended)
- CPU: 4 cores minimum per node
- Storage: 50 GB free disk space per node
- Network: Low latency between nodes (< 10ms for same datacenter)
Nomad Installation
Linux (AMD64)
# Download Nomad
wget https://releases.hashicorp.com/nomad/1.7.3/nomad_1.7.3_linux_amd64.zip
# Extract
unzip nomad_1.7.3_linux_amd64.zip
# Move to system path
sudo mv nomad /usr/local/bin/
# Verify installation
nomad version
macOS (Homebrew)
# Install Nomad
brew install nomad
# Verify installation
nomad version
Other Platforms
See the official Nomad installation guide for other platforms and installation methods.
Nomad Agent Setup
Development Mode (Single Node)
For local testing and development:
# Start Nomad in development mode (server + client)
sudo nomad agent -dev
# In another terminal, verify
nomad node status
nomad server members
# Access Web UI
open http://localhost:4646
Important Nomad Ports:
- 4646: HTTP API and Web UI
- 4647: RPC (Remote Procedure Call)
- 4648: Serf WAN (gossip protocol)
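If a host firewall sits between cluster members, these ports must be open between nodes. A minimal sketch using ufw (assuming an Ubuntu host; adapt to your firewall):
# Allow Nomad cluster traffic
sudo ufw allow 4646/tcp   # HTTP API and Web UI
sudo ufw allow 4647/tcp   # RPC
sudo ufw allow 4648/tcp   # Serf gossip
sudo ufw allow 4648/udp   # Serf gossip (UDP)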
Production Cluster
For production, separate server and client nodes are recommended.
Server Node:
sudo nomad agent -config=/etc/nomad.d/server.hcl
server.hcl:
datacenter = "dc1"
data_dir = "/var/lib/nomad"
server {
enabled = true
bootstrap_expect = 3
}
Client Node:
sudo nomad agent -config=/etc/nomad.d/client.hcl
client.hcl:
datacenter = "dc1"
data_dir = "/var/lib/nomad"
client {
enabled = true
# Clients must be able to reach the servers; list them explicitly
# (or rely on Consul-based auto-discovery if available).
servers = ["<server-ip>:4647"]
}
plugin "docker" {
config {
allow_privileged = false
}
}
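For production agents, run Nomad under a process supervisor rather than in a foreground shell. A minimal systemd unit sketch, assuming the binary and config paths used above:
# /etc/systemd/system/nomad.service
[Unit]
Description=Nomad Agent
Wants=network-online.target
After=network-online.target

[Service]
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
LimitNOFILE=65536

[Install]
WantedBy=multi-user.target
Enable and start it with sudo systemctl enable --now nomad.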
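The job templates below pull configuration from Consul KV via {{ key ... }} lookups, so a Consul agent must be reachable by Nomad and the keys must be seeded before deployment. A sketch with placeholder values (key paths match the templates below):
consul kv put manager/db/host db.internal
consul kv put manager/db/port 5432
consul kv put manager/db/user productify
consul kv put manager/db/password 'change-me'
consul kv put manager/db/name productify
consul kv put manager/pocketid/host pocketid.internal
consul kv put manager/pocketid/api_key 'change-me'
consul kv put proxy/domain example.com
consul kv put proxy/acme_email ops@example.com
consul kv put proxy/cloudflare_token 'change-me'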
Complete Job Specification
Manager Job
job "manager" {
datacenters = ["dc1"]
type = "service"
group "api" {
count = 3
update {
max_parallel = 1
health_check = "checks"
min_healthy_time = "10s"
healthy_deadline = "3m"
auto_revert = true
}
network {
port "http" {
to = 8080
}
}
service {
name = "manager-api"
port = "http"
tags = ["productify", "api"]
check {
type = "http"
path = "/health"
interval = "10s"
timeout = "2s"
}
}
task "server" {
driver = "docker"
config {
image = "ghcr.io/productifyfw/manager:${VERSION}"
ports = ["http"]
}
env {
PFY_RUN_MODE = "api"
}
template {
data = <<EOH
PFY_DB_HOST={{ key "manager/db/host" }}
PFY_DB_PORT={{ key "manager/db/port" }}
PFY_DB_USER={{ key "manager/db/user" }}
PFY_DB_PASSWORD={{ key "manager/db/password" }}
PFY_DB_NAME={{ key "manager/db/name" }}
PFY_DB_SSLMODE=require
PFY_POCKET_ID_HOST={{ key "manager/pocketid/host" }}
PFY_POCKET_ID_API_KEY={{ key "manager/pocketid/api_key" }}
PFY_ENV=production
PFY_RUN_MODE=api
PFY_PORT=${NOMAD_PORT_http}
EOH
destination = "secrets/env"
env = true
}
resources {
cpu = 500
memory = 512
}
}
}
group "executor" {
count = 1
constraint {
operator = "distinct_hosts"
value = "true"
}
network {
port "metrics" {
to = 9090
}
}
service {
name = "manager-executor"
port = "metrics"
tags = ["productify", "executor"]
}
task "executor" {
driver = "docker"
config {
image = "ghcr.io/productifyfw/manager:${VERSION}"
}
env {
PFY_RUN_MODE = "executor"
}
template {
data = <<EOH
PFY_DB_HOST={{ key "manager/db/host" }}
PFY_DB_PORT={{ key "manager/db/port" }}
PFY_DB_USER={{ key "manager/db/user" }}
PFY_DB_PASSWORD={{ key "manager/db/password" }}
PFY_DB_NAME={{ key "manager/db/name" }}
EOH
destination = "secrets/env"
env = true
}
resources {
cpu = 200
memory = 256
}
}
}
}
Optimizer Job
job "optimizer" {
datacenters = ["dc1"]
type = "service"
group "optimizer" {
count = 2
network {
port "http" {
to = 8000
}
port "metrics" {
to = 9090
}
}
service {
name = "optimizer"
port = "http"
tags = ["productify", "autoscaler"]
check {
type = "http"
path = "/health"
interval = "10s"
timeout = "2s"
}
}
task "server" {
driver = "docker"
config {
image = "ghcr.io/productifyfw/optimizer:${VERSION}"
ports = ["http", "metrics"]
}
env {
CACHE_SIZE = "10"
FORECAST_HORIZON = "60"
LOG_LEVEL = "INFO"
}
resources {
cpu = 1000
memory = 1024
}
}
}
}
Proxy Job
job "proxy" {
datacenters = ["dc1"]
type = "system" # Deploy on all nodes
group "caddy" {
network {
port "http" {
static = 80
to = 80
}
port "https" {
static = 443
to = 443
}
port "admin" {
to = 2019
}
}
service {
name = "proxy"
port = "http"
tags = ["productify", "proxy"]
check {
type = "http"
port = "admin"
path = "/health"
interval = "10s"
timeout = "2s"
}
}
task "caddy" {
driver = "docker"
config {
image = "caddy:2.7-alpine"
ports = ["http", "https", "admin"]
volumes = [
"local/Caddyfile:/etc/caddy/Caddyfile"
]
}
template {
data = <<EOH
{
admin :2019
# Automatic HTTPS is on by default; no directive needed.
email {{ key "proxy/acme_email" }}
}
*.{{ key "proxy/domain" }} {
tls {
dns cloudflare {{ key "proxy/cloudflare_token" }}
}
@manager host manager.{{ key "proxy/domain" }}
handle @manager {
reverse_proxy {
{{- range service "manager-api" }}
to {{ .Address }}:{{ .Port }}
{{- end }}
lb_policy least_conn
health_uri /health
health_interval 10s
}
}
@tenant host *.{{ key "proxy/domain" }}
handle @tenant {
reverse_proxy {
{{- range service "tenant-proxy" }}
to {{ .Address }}:{{ .Port }}
{{- end }}
}
}
}
EOH
destination = "local/Caddyfile"
change_mode = "signal"
change_signal = "SIGHUP"
}
resources {
cpu = 200
memory = 128
}
}
}
}
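The wildcard site block assumes a DNS record pointing *.<your-domain> at the proxy nodes, and the Cloudflare API token stored in Consul must have DNS edit permission for the zone. A quick resolution check, with example.com standing in for the configured proxy/domain key:
dig +short manager.example.com
dig +short sometenant.example.com
Autoscaler Plugin Job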
job "nomad-autoscaler" {
datacenters = ["dc1"]
type = "service"
group "autoscaler" {
count = 1
network {
port "http" {
to = 8080
}
port "prometheus" {
to = 8081
}
}
service {
name = "nomad-autoscaler"
port = "http"
check {
type = "http"
path = "/v1/health"
interval = "10s"
timeout = "2s"
}
}
task "autoscaler" {
driver = "docker"
config {
image = "hashicorp/nomad-autoscaler:0.4.0"
volumes = [
"local/config.hcl:/etc/autoscaler.hcl",
"local/policies:/policies",
# Expose the plugin binary fetched by the artifact stanza below
# at the plugin_dir configured for the agent.
"local/plugins:/plugins"
]
args = ["agent", "-config=/etc/autoscaler.hcl"]
}
# Autoscaler configuration
template {
data = <<EOH
plugin_dir = "/plugins"
nomad {
address = "http://{{ env "NOMAD_IP_http" }}:4646"
}
policy_dir = "/policies"
http {
bind_address = "0.0.0.0"
bind_port = 8080
}
telemetry {
prometheus_metrics = true
prometheus_retention_time = "24h"
}
EOH
destination = "local/config.hcl"
}
# Scaling policy for Manager API
template {
data = <<EOH
scaling "horizontal" {
enabled = true
min = 2
max = 10
policy {
evaluation_interval = "10s"
cooldown = "30s"
target "nomadscaler" {
namespace = "default"
job = "manager"
group = "api"
optimizer_url = "http://{{- with service "optimizer" }}{{ with index . 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}"
cache_size = 10
}
check "cpu" {
source = "nomad-apm"
query = "avg_cpu_percent"
strategy "target-value" {
target = 75
}
}
check "memory" {
source = "nomad-apm"
query = "avg_memory_percent"
strategy "target-value" {
target = 80
}
}
}
}
EOH
destination = "local/policies/manager-api.hcl"
}
# Copy plugin binary
artifact {
source = "https://releases.productify.io/nomadscaler/latest/nomadscaler"
destination = "plugins/nomadscaler"
mode = "file"
}
resources {
cpu = 200
memory = 256
}
}
}
}
Monitoring
Monitor Optimizer and Nomad Autoscaler metrics using Prometheus.
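Nomad only serves Prometheus-format metrics when telemetry is enabled in the agent configuration, so add the following to server.hcl and client.hcl (standard agent telemetry options):
telemetry {
  prometheus_metrics         = true
  publish_allocation_metrics = true
  publish_node_metrics       = true
}
With telemetry enabled, configure Prometheus to scrape the stack: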
# prometheus.yml
scrape_configs:
  - job_name: "optimizer"
    static_configs:
      - targets: ["optimizer:8000"]
  - job_name: "nomad-autoscaler"
    metrics_path: "/v1/metrics"
    params:
      format: ["prometheus"]
    static_configs:
      - targets: ["nomad-autoscaler:8080"]
  - job_name: "nomad"
    metrics_path: "/v1/metrics"
    params:
      format: ["prometheus"]
    static_configs:
      - targets: ["localhost:4646"]
Deployment Commands
Deploy Manager Component
cd manager/nomad
# Validate job specification
nomad job validate manager.nomad.hcl
# Plan the deployment (dry run)
nomad job plan manager.nomad.hcl
# Deploy the job
nomad job run manager.nomad.hcl
# Check job status
nomad job status manager
# View allocations (running instances)
nomad alloc status <alloc-id>
# Follow logs for a specific allocation
nomad alloc logs -f <alloc-id> server
Manager Job Components:
- API Group: 3 instances (horizontally scaled)
  - Serves GraphQL API
  - Handles HTTP requests
  - Registers with Nomad service discovery
- Executor Group: 1 instance
  - Runs scheduled jobs and triggers
  - Processes background tasks
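To tail the executor's background-task logs, find its allocation in the job status output and follow the executor task (allocation IDs are placeholders):
# The Allocations table lists each alloc with its task group
nomad job status manager
# Follow the executor task's logs
nomad alloc logs -f <executor-alloc-id> executor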
Deploy Proxy and Authentication
cd proxy/nomad
# Deploy Pocket ID first
nomad job validate pocketid.nomad.hcl
nomad job run pocketid.nomad.hcl
# Wait for Pocket ID to be healthy
nomad job status pocketid
# Then deploy proxy
nomad job validate proxy.nomad.hcl
nomad job run proxy.nomad.hcl
# Check status
nomad job status proxy
# View service endpoints
nomad service list
Deploy Autoscaler Stack
cd autoscaler/nomadscaler/config
# Deploy autoscaler with optimizer and Prometheus
nomad job validate autoscaler.hcl
nomad job run autoscaler.hcl
# Check status
nomad job status autoscaler
# Access Prometheus UI
open http://localhost:9090
# Verify metrics collection
curl http://localhost:9090/api/v1/targets
Autoscaler Components:
- Nomad Autoscaler Plugin: Evaluates scaling policies
- Optimizer Service: Provides ML-based scaling decisions (Python/FastAPI)
- Prometheus: Collects and stores metrics
Deploy All Jobs (Complete Stack)
# Set version variable
export VERSION="1.0.0"
# Deploy in order
nomad job run -var="VERSION=${VERSION}" manager.nomad.hcl
nomad job run pocketid.nomad.hcl
nomad job run proxy.nomad.hcl
nomad job run -var="VERSION=${VERSION}" optimizer.nomad.hcl
nomad job run autoscaler.nomad.hcl
# Verify all jobs are running
nomad job status
Rolling Update
# Update Manager
nomad job run -check-index $(nomad job inspect manager | jq .Job.JobModifyIndex) manager.nomad.hcl
# Monitor deployment
nomad deployment status <deployment-id>
# Watch deployment progress
watch -n 2 'nomad job status manager | head -20'
Rollback
# View job versions
nomad job history manager
# Revert to previous version
nomad job revert manager <version>
# Stop a failed deployment
nomad deployment fail <deployment-id>
Monitoring and Verification
Nomad Web UI
Access the Nomad UI at http://localhost:4646:
- Jobs: View all running jobs, their status, and allocations
- Allocations: See details of each running container instance
- Nodes: Infrastructure information and resource usage
- Topology: Visual overview of the cluster state
- Evaluations: View scheduling decisions
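The CLI can also open the UI for you; with ACLs enabled, -authenticate signs you in with a one-time token:
nomad ui
nomad ui -authenticate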
Check Job Health
# Job overview
nomad job status manager
# Specific allocation details
nomad alloc status <alloc-id>
# Service health checks
nomad service info manager-api
# Recent job history
nomad job history manager
# View deployment status
nomad deployment status <deployment-id>
Prometheus Metrics
Access Prometheus at http://localhost:9090 and query metrics:
# Manager API availability
curl 'http://localhost:9090/api/v1/query?query=up{job="manager"}'
# CPU usage across cluster
curl 'http://localhost:9090/api/v1/query?query=nomad_client_host_cpu_user'
# Memory usage per allocation
curl 'http://localhost:9090/api/v1/query?query=nomad_client_alloc_memory_usage'
# HTTP request rate (if exposed by app)
curl 'http://localhost:9090/api/v1/query?query=rate(http_requests_total[5m])'
# Network receive bytes per allocation
curl 'http://localhost:9090/api/v1/query?query=nomad_client_alloc_network_rx_bytes'
Test Autoscaler
Verify autoscaler functionality by generating load:
# Install Apache Bench (if not already installed)
# Ubuntu/Debian: apt-get install apache2-utils
# macOS: included with the OS (/usr/sbin/ab)
# Generate load
ab -n 10000 -c 100 http://localhost:8080/api/health
# Watch scaling events in real-time
nomad job history manager
# Monitor allocation count changes
watch -n 2 'nomad job status manager | grep "Allocations"'
# View autoscaler decision logs
nomad alloc logs -f <autoscaler-alloc-id> autoscaler
# Check optimizer service health
curl http://localhost:8000/health
# View recent scaling decisions
curl http://localhost:8000/metrics | grep scaling
Service Discovery Verification
# List all registered services
nomad service list
# Get specific service information
nomad service info manager-api
High Availability
Executor Instance
Only one Executor instance should run. The Executor uses database-level locking to ensure single-instance execution across the cluster.
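Because the lock lives in PostgreSQL, it holds even if two executors are accidentally scheduled. A sketch of the underlying pattern using a session-level advisory lock (illustrative only; the lock key 42 is an arbitrary example, and the real mechanism is internal to the Executor):
# Only one session can hold the lock at a time; others get 'false'
psql -U productify -d productify -c "SELECT pg_try_advisory_lock(42);"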
Increase the count for API instances to scale horizontally:
group "api" {
count = 5 # Scale as needed
# ...
}
Troubleshooting
Job Won't Start
# Validate job file syntax
nomad job validate <job-file>.hcl
# Get detailed error message
nomad job status <job-name>
# Check allocation-level errors
nomad alloc status <alloc-id>
# View allocation logs
nomad alloc logs <alloc-id> <task-name>
# Check Docker driver status
docker ps
docker images
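If allocations stay queued or the Docker driver is reported unhealthy, the client agent's own logs usually name the cause (the unit name assumes the systemd setup above):
journalctl -u nomad -f
# Or stream logs from a running agent at debug level
nomad monitor -log-level=DEBUG
Network Connectivity Issues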
# Check service discovery
nomad service list
nomad service info <service-name>
# Verify port mappings
nomad alloc status <alloc-id> | grep -A 10 "Ports"
# Enter container for debugging
nomad alloc exec -task <task-name> <alloc-id> /bin/sh
# Test connectivity from inside container
nomad alloc exec <alloc-id> curl http://localhost:8080/health
nomad alloc exec <alloc-id> ping postgres
Database Connection Problems
# Test database connectivity
nomad alloc exec <postgres-alloc-id> psql -U postgres -d productify -c "SELECT version();"
# Check database logs
nomad alloc logs <postgres-alloc-id>
Autoscaler Not Scaling
# Check scaling policies
nomad scaling policy list
# View policy details
nomad scaling policy info <policy-id>
# Test manual scaling
nomad job scale manager 5
# Verify Prometheus targets
curl http://localhost:9090/api/v1/targets
# Check optimizer logs
nomad alloc logs -f <optimizer-alloc-id> optimizer
# Verify autoscaler plugin
nomad alloc logs -f <autoscaler-alloc-id> autoscaler
High Resource Usage
# Check cluster resources
nomad node status
# View resource allocation
nomad status
# Check specific node usage
nomad node status <node-id>
# See resource constraints
nomad job inspect manager | jq '.Job.TaskGroups[].Tasks[].Resources'
Advanced Configuration
Auto-Revert on Failure
update {
max_parallel = 1
health_check = "checks"
min_healthy_time = "10s"
healthy_deadline = "5m"
auto_revert = true
auto_promote = false
}
Canary Deployments
update {
max_parallel = 1
canary = 1
min_healthy_time = "30s"
healthy_deadline = "10m"
auto_promote = false
auto_revert = true
}
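With auto_promote = false, a healthy canary must be promoted manually before the remaining instances are replaced:
# Promote the canary once it looks healthy
nomad deployment promote <deployment-id>
Resource Limits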
resources {
cpu = 1000 # MHz
memory = 1024 # MB
memory_max = 2048 # Hard cap (MB); requires memory oversubscription enabled on the cluster
}
Spread Across Nodes
spread {
attribute = "${node.unique.id}"
weight = 100
}