Operations

Cheatsheet & Quick Reference

One-liners and short command sequences for the most common DevOps tasks. All commands are bash; PowerShell variants are noted where they differ.

Set environment shortcuts before running commands:
# Pick ONE of the two lines below. Running both leaves the staging values set,
# because the second line overwrites the first.
# Production:
ENV=production REGION=ap-south-1 CLUSTER=zelly-production
# Staging:
ENV=staging REGION=ap-southeast-1 CLUSTER=zelly-staging

ECS & Deployments

bash — service status

# All services status (desired vs. running count and rollout state of the newest deployment)
# CLUSTER/REGION are quoted so an unset value fails loudly instead of shifting the arguments.
aws ecs describe-services \
  --cluster "$CLUSTER" --services fastify-nova customer-panel orion-backend events-consumer storefront \
  --region "$REGION" \
  --query 'services[*].{name:serviceName,desired:desiredCount,running:runningCount,status:status,deploy:deployments[0].rolloutState}'

# Single service detail (full service description)
aws ecs describe-services --cluster "$CLUSTER" --services fastify-nova \
  --region "$REGION" --query 'services[0]'

bash — force redeploy

# Single service
aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
  --force-new-deployment --region "$REGION"

# All services at once
# (same five services as the status commands; events-consumer was missing from this list)
for svc in fastify-nova customer-panel orion-backend events-consumer storefront; do
  aws ecs update-service --cluster "$CLUSTER" --service "$svc" \
    --force-new-deployment --region "$REGION"
done

bash — scale service

# Scale to specific count
aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
  --desired-count 2 --region "$REGION"

# Scale to 0 (stop service)
aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
  --desired-count 0 --region "$REGION"

bash — rollback (previous revision)

# Get current task def (ARN ending in ":<revision>")
CURRENT=$(aws ecs describe-services --cluster "$CLUSTER" --services fastify-nova \
  --region "$REGION" --query 'services[0].taskDefinition' --output text)
echo "Current: $CURRENT"

# Roll back one revision
# REV is everything after the last ":"; refuse to continue if it is not a
# number or there is no earlier revision (revision 1 would become 0).
REV=${CURRENT##*:}
if ! [[ "$REV" =~ ^[0-9]+$ ]] || (( REV <= 1 )); then
  echo "Cannot roll back: no earlier revision than '$CURRENT'" >&2
else
  # NOTE(review): assumes revision REV-1 exists and is still ACTIVE — confirm before running
  PREV="${CURRENT%:*}:$((REV - 1))"
  aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
    --task-definition "$PREV" --force-new-deployment --region "$REGION"
fi

bash — check stopped tasks

# List stopped task ARNs (first three)
STOPPED=$(aws ecs list-tasks --cluster "$CLUSTER" --service-name fastify-nova \
  --desired-status STOPPED --region "$REGION" --query 'taskArns[0:3]' --output text)

# Describe stopped tasks
# Skipped when nothing was returned: describe-tasks rejects an empty --tasks list.
if [ -n "$STOPPED" ] && [ "$STOPPED" != "None" ]; then
  # $STOPPED is intentionally unquoted: it holds several whitespace-separated ARNs.
  # shellcheck disable=SC2086
  aws ecs describe-tasks --cluster "$CLUSTER" --tasks $STOPPED --region "$REGION" \
    --query 'tasks[*].{stopped:stoppedReason,containers:containers[*].{name:name,exit:exitCode,reason:reason}}'
else
  echo "No stopped tasks found for fastify-nova" >&2
fi

bash — running tasks with image tags

# Running task ARNs for the service
TASKS=$(aws ecs list-tasks --cluster "$CLUSTER" --service-name fastify-nova \
  --desired-status RUNNING --region "$REGION" --query 'taskArns' --output text)

# Task definition and per-container image/status for each running task
# Skipped when nothing is running: describe-tasks rejects an empty --tasks list.
if [ -n "$TASKS" ] && [ "$TASKS" != "None" ]; then
  # $TASKS is intentionally unquoted: it holds several whitespace-separated ARNs.
  # shellcheck disable=SC2086
  aws ecs describe-tasks --cluster "$CLUSTER" --tasks $TASKS --region "$REGION" \
    --query 'tasks[*].{id:taskArn,td:taskDefinitionArn,containers:containers[*].{name:name,image:image,status:lastStatus}}'
else
  echo "No running tasks found for fastify-nova" >&2
fi

ECR & Docker Images

bash — list recent images

# Recent images for a service (always ap-south-1 for ECR)
# Shows the 10 most recently pushed tagged images. The size column is in bytes:
# imageSizeInBytes is printed as-is, so it is labelled sizeBytes rather than sizeMB.
aws ecr describe-images --repository-name zelly/fastify-nova \
  --region ap-south-1 --filter tagStatus=TAGGED \
  --query 'sort_by(imageDetails, &imagePushedAt)[-10:].{tags:imageTags,pushed:imagePushedAt,sizeBytes:to_string(imageSizeInBytes)}' \
  --output table

bash — ECR login and push manually

AWS_ACCOUNT=279391564627
ECR_REGION=ap-south-1
REPO="$AWS_ACCOUNT.dkr.ecr.$ECR_REGION.amazonaws.com"

# Login (password is piped on stdin, never placed on the command line)
aws ecr get-login-password --region "$ECR_REGION" | \
  docker login --username AWS --password-stdin "$REPO"

# Build, tag and push
# The image reference is resolved once so build and push always use the same tag.
IMAGE="$REPO/zelly/fastify-nova:manual-$(git rev-parse --short HEAD)"
docker build -t "$IMAGE" .
docker push "$IMAGE"

bash — re-tag image (promote staging to prod)

# Get the manifest of the staging image
# (staging-abc12345 is an example tag — substitute the tag you want to promote)
MANIFEST=$(aws ecr batch-get-image \
  --repository-name zelly/fastify-nova \
  --image-ids imageTag=staging-abc12345 \
  --region ap-south-1 \
  --query 'images[0].imageManifest' --output text)

# Push with a new tag: the manifest fetched above is registered again under
# prod-abc12345. Only the manifest is sent; nothing is built or pushed with docker.
aws ecr put-image \
  --repository-name zelly/fastify-nova \
  --image-tag prod-abc12345 \
  --image-manifest "$MANIFEST" \
  --region ap-south-1

CloudWatch Logs

bash — tail recent logs

# Last 15 minutes of logs (--start-time is epoch milliseconds; 900000 ms = 15 min)
aws logs filter-log-events \
  --log-group-name /zelly/ecs/fastify-nova \
  --start-time $(( $(date +%s) * 1000 - 900000 )) \
  --region "$REGION" \
  --query 'events[*].message' --output text

# Filter by pattern over the last hour (3600000 ms)
# NOTE: CloudWatch filter patterns are case-sensitive — '"ERROR"' does not match
# "error" or "Error". Use Logs Insights with /(?i)error/ for case-insensitive search.
aws logs filter-log-events \
  --log-group-name /zelly/ecs/fastify-nova \
  --start-time $(( $(date +%s) * 1000 - 3600000 )) \
  --filter-pattern '"ERROR"' \
  --region "$REGION" \
  --query 'events[*].{ts:timestamp,msg:message}' --output table

PowerShell — tail recent logs

# Start of the window: 15 minutes ago (UTC), as Unix epoch milliseconds.
# The region is read from the REGION environment variable ($env:REGION), so it
# must be set as an environment variable in this PowerShell session.
$start = [DateTimeOffset]::UtcNow.AddMinutes(-15).ToUnixTimeMilliseconds()
aws logs filter-log-events `
  --log-group-name /zelly/ecs/fastify-nova `
  --start-time $start `
  --region $env:REGION `
  --query 'events[*].message' --output text

CloudWatch Logs Insights — paste in console

# All errors across all Zelly services
# (case-insensitive match on error / exception / fatal / crash; newest 100)
fields @timestamp, @logStream, @message
| filter @message like /(?i)(error|exception|fatal|crash)/
| sort @timestamp desc
| limit 100

# Requests slower than 1 second
# NOTE(review): assumes the "duration" field in the log line is in milliseconds — confirm
fields @timestamp, @message
| parse @message '"duration":*,' as duration
| filter ispresent(duration) and duration > 1000
| sort @timestamp desc
| limit 50

# Service startup events
# (case-insensitive match on listen / started / ready / connected; newest 30)
fields @timestamp, @message
| filter @message like /(?i)(listen|started|ready|connected)/
| sort @timestamp desc
| limit 30

# Specific request ID trace
# (replace REQUEST_ID_HERE with the ID; results are oldest-first)
fields @timestamp, @message
| filter @message like /REQUEST_ID_HERE/
| sort @timestamp asc

AWS Secrets Manager

bash — read / update secrets

# Read a secret (pretty-printed)
aws secretsmanager get-secret-value \
  --secret-id zelly/fastify-nova/env \
  --region "$REGION" \
  --query 'SecretString' --output text | python3 -m json.tool

# Update a single key (merge, don't replace)
CURRENT=$(aws secretsmanager get-secret-value \
  --secret-id zelly/fastify-nova/env --region "$REGION" \
  --query 'SecretString' --output text)
# Merge one key into the existing JSON. Replace KEY and 'new-value' below;
# all other keys are carried over unchanged.
UPDATED_JSON=$(printf '%s' "$CURRENT" | python3 -c \
  'import json, sys; d = json.load(sys.stdin); d[sys.argv[1]] = sys.argv[2]; print(json.dumps(d))' \
  KEY 'new-value')
# Only write back when the merge produced output, so a failed read or parse
# can never overwrite the secret with an empty value.
if [ -n "$UPDATED_JSON" ]; then
  aws secretsmanager put-secret-value \
    --secret-id zelly/fastify-nova/env \
    --secret-string "$UPDATED_JSON" \
    --region "$REGION"
else
  echo "Merge failed; secret left unchanged" >&2
fi

# Create a new secret
aws secretsmanager create-secret \
  --name zelly/new-service/env \
  --secret-string '{"KEY":"value"}' \
  --region "$REGION"

# Delete secret immediately (skips the recovery window, which is 30 days by
# default and 7 days at minimum). This cannot be undone.
aws secretsmanager delete-secret \
  --secret-id zelly/old-service/env \
  --force-delete-without-recovery \
  --region "$REGION"

bash — list all Zelly secrets

# List secrets whose name starts with "zelly/"
# (the documented option name is --filters, plural)
aws secretsmanager list-secrets \
  --filters Key=name,Values=zelly/ \
  --region "$REGION" \
  --query 'SecretList[*].{name:Name,arn:ARN,last:LastChangedDate}' \
  --output table

ECS does not hot-reload secrets. Always force-redeploy the affected service after updating a secret value.

Aurora MySQL

bash — cluster status

# Cluster status (status, engine version, writer/reader endpoints, Serverless v2 ACU range)
aws rds describe-db-clusters \
  --db-cluster-identifier "zelly-aurora-$ENV" \
  --region "$REGION" \
  --query 'DBClusters[0].{status:Status,engine:EngineVersion,endpoint:Endpoint,reader:ReaderEndpoint,min:ServerlessV2ScalingConfiguration.MinCapacity,max:ServerlessV2ScalingConfiguration.MaxCapacity}'

# List DB cluster instances
aws rds describe-db-instances \
  --filters "Name=db-cluster-id,Values=zelly-aurora-$ENV" \
  --region "$REGION" \
  --query 'DBInstances[*].{id:DBInstanceIdentifier,class:DBInstanceClass,status:DBInstanceStatus}'

bash — connect to Aurora

# Staging: direct (no VPN)
AURORA_HOST=$(aws rds describe-db-clusters \
  --db-cluster-identifier zelly-aurora-staging \
  --region ap-southeast-1 \
  --query 'DBClusters[0].Endpoint' --output text)
# -p with no value makes mysql prompt for the password
mysql -h "$AURORA_HOST" -u admin -p

# Production: WireGuard VPN required first!
# wg show wg0  → should show active handshake
AURORA_HOST=$(aws rds describe-db-clusters \
  --db-cluster-identifier zelly-aurora-production \
  --region ap-south-1 \
  --query 'DBClusters[0].Endpoint' --output text)
mysql -h "$AURORA_HOST" -u admin -p

mysql — useful queries

-- List all schemas
SHOW DATABASES;

-- Check connection count by user/host
SELECT USER, HOST, COUNT(*) as cnt
FROM information_schema.PROCESSLIST
GROUP BY USER, HOST ORDER BY cnt DESC;

-- Idle connections: generates one KILL statement per connection that has been
-- in the Sleep state for more than 300 seconds. This query kills nothing by
-- itself; review the output and run the statements you want.
SELECT CONCAT('KILL ', ID, ';') FROM information_schema.PROCESSLIST
WHERE COMMAND = 'Sleep' AND TIME > 300;

-- Check table sizes (20 largest tables by data size in the three listed schemas)
SELECT table_schema, table_name,
  ROUND(data_length/1024/1024, 1) as data_MB,
  ROUND(index_length/1024/1024, 1) as index_MB
FROM information_schema.TABLES
WHERE table_schema IN ('astro_primary','ecom_store_front','backoffice')
ORDER BY data_MB DESC LIMIT 20;

-- Check slow query log (if enabled)
SHOW VARIABLES LIKE 'slow_query_log%';

-- Show running queries, then kill one
-- ([process_id] is a placeholder: use an Id value from the PROCESSLIST output)
-- NOTE(review): on RDS/Aurora the admin user may need CALL mysql.rds_kill(<id>) instead of KILL — confirm
SHOW FULL PROCESSLIST;
KILL [process_id];

-- Migration state (TypeORM): the 10 most recent rows of the migrations table
SELECT * FROM astro_primary.migrations ORDER BY timestamp DESC LIMIT 10;

bash — take manual snapshot

# Manual snapshot of the production cluster, named manual-YYYYMMDD-HHMM
# from the current local time.
aws rds create-db-cluster-snapshot \
  --region ap-south-1 \
  --db-cluster-identifier zelly-aurora-production \
  --db-cluster-snapshot-identifier "manual-$(date +%Y%m%d-%H%M)"

Redis & BullMQ

bash — ElastiCache status

# Replication group status plus the primary endpoint of the first node group
aws elasticache describe-replication-groups \
  --replication-group-id "zelly-redis-$ENV" \
  --region "$REGION" \
  --query 'ReplicationGroups[0].{status:Status,nodeGroups:NodeGroups[0].PrimaryEndpoint}'

bash — connect to Redis (via WireGuard VPN for production)

# Get endpoint
REDIS_ENDPOINT=$(aws elasticache describe-replication-groups \
  --replication-group-id "zelly-redis-$ENV" --region "$REGION" \
  --query 'ReplicationGroups[0].NodeGroups[0].PrimaryEndpoint.Address' --output text)

# Fetch the auth token (the "auth_token" key of the zelly/redis/auth secret)
REDIS_AUTH=$(aws secretsmanager get-secret-value \
  --secret-id zelly/redis/auth --region "$REGION" \
  --query 'SecretString' --output text | python3 -c "import sys,json; print(json.load(sys.stdin)['auth_token'])")

# Connect (TLS required for ElastiCache)
# The token is passed through REDISCLI_AUTH instead of -a so it does not appear
# in the process list or shell history.
REDISCLI_AUTH="$REDIS_AUTH" redis-cli -h "$REDIS_ENDPOINT" -p 6379 --tls

redis-cli — useful commands

# Queue depths (BullMQ) — the "wait" key is a list, so LLEN applies
LLEN "bull:store-events:wait"
LLEN "bull:SHOPIFY_WEBHOOK:wait"
LLEN "bull:SHOPIFY_CATALOG_SYNC:wait"

# All BullMQ queues
# SCAN iterates without blocking the server (KEYS scans the whole keyspace in one
# blocking call). Repeat with the returned cursor until it comes back as 0.
SCAN 0 MATCH "bull:*:wait" COUNT 1000

# Jobs in failed state — BullMQ keeps failed jobs in a sorted set, so use ZCARD
# (LLEN on this key returns a WRONGTYPE error)
ZCARD "bull:store-events:failed"

# Check memory usage
INFO memory

# Check connected clients
INFO clients

# Monitor live commands (dev only — high overhead)
MONITOR

WireGuard VPN

bash — VPN status and setup

# Check if WireGuard is running
wg show wg0

# Bring up VPN
wg-quick up wg0

# Bring down VPN
wg-quick down wg0

# Check your VPN IP
ip addr show wg0

# Test connectivity to production private resources
# (10.0.x.x is a placeholder for the resource's private IP; options come before
# the host so the commands also work with nc builds that stop parsing options at
# the first positional argument)
ping -c 3 10.0.0.1         # VPN gateway
nc -zv -w 3 10.0.x.x 3306  # Aurora MySQL
nc -zv -w 3 10.0.x.x 6379  # Redis
nc -zv -w 3 10.0.x.x 8123  # ClickHouse HTTP

PowerShell — VPN status and setup

# Bring up VPN
# NOTE(review): wg-quick is a bash script and may not be available with WireGuard
# for Windows — confirm it works in your PowerShell setup (the Windows client
# normally manages tunnels with `wireguard /installtunnelservice <path-to-conf>`)
wg-quick up wg0

# Check status
wg show wg0

# Check VPN IP (prints the ipconfig lines mentioning "WireGuard" plus the 5 lines after each)
ipconfig | Select-String "WireGuard" -Context 0,5

bash — add new WireGuard peer via AWS SSM

# Generate keypair on bastion
# - umask 077 makes /tmp/priv readable by its owner only
# - the command's stdout is the new peer's public key
# - --instance-ids is the documented option name for send-command
CMD_ID=$(aws ssm send-command \
  --instance-ids i-BASTION_INSTANCE_ID \
  --document-name AWS-RunShellScript \
  --parameters '{"commands":["umask 077; wg genkey | tee /tmp/priv | wg pubkey"]}' \
  --region ap-south-1 \
  --query 'Command.CommandId' --output text)
echo "$CMD_ID"

# Fetch the command output (the public key); re-run if it is still empty
# because the command has not finished yet
aws ssm get-command-invocation \
  --command-id "$CMD_ID" \
  --instance-id i-BASTION_INSTANCE_ID \
  --region ap-south-1 \
  --query 'StandardOutputContent' --output text

# Or use zelly-ops VPN section to generate and register peers

Terraform

bash — standard workflow

# Pick ONE working directory:
cd d:/zelly/terraform           # Production
cd d:/zelly/terraform/environments/staging  # Staging

# Plan before applying (saves the plan to ./tfplan)
terraform plan -out tfplan

# Apply exactly the saved plan
terraform apply tfplan

# Target a specific resource
# (full resource address: module path + resource type + name, matching what
# `terraform state list` prints)
terraform apply -target=module.ecs_cluster -target=module.services.aws_ecs_service.fastify_nova

# Refresh state without applying
# (`terraform refresh` is deprecated; -refresh-only shows the detected drift
# and asks for confirmation before writing it to state)
terraform apply -refresh-only

# Show all outputs
terraform output

bash — state management

# List all resources in state
terraform state list

# Show a specific resource (address as printed by `terraform state list`)
terraform state show module.services.aws_ecs_service.fastify_nova

# Import existing resource into state
# (arn:aws:... is a placeholder for the real resource identifier)
terraform import aws_secretsmanager_secret.example arn:aws:...

# Remove resource from state (does NOT destroy the resource)
terraform state rm module.old_resource.aws_whatever.name

# Force unlock (if state is locked); LOCK_ID is a placeholder for the lock's ID
terraform force-unlock LOCK_ID

bash — backend bootstrap (one-time)

# Create S3 state bucket (ap-south-1)
aws s3api create-bucket \
  --bucket zelly-terraform-state \
  --region ap-south-1 \
  --create-bucket-configuration LocationConstraint=ap-south-1

# Turn on object versioning for the state bucket
aws s3api put-bucket-versioning \
  --bucket zelly-terraform-state \
  --versioning-configuration Status=Enabled

# Create DynamoDB lock table
# (single string hash key named LockID, on-demand billing)
aws dynamodb create-table \
  --table-name zelly-terraform-locks \
  --attribute-definitions AttributeName=LockID,AttributeType=S \
  --key-schema AttributeName=LockID,KeyType=HASH \
  --billing-mode PAY_PER_REQUEST \
  --region ap-south-1

After any Terraform infrastructure change, ECS services won't auto-redeploy (lifecycle ignore_changes). Always run aws ecs update-service --force-new-deployment after infrastructure applies.

Git Release Flow

bash — staging deploy (auto on push to main)

# Deploy to staging by merging to main
# (feature/my-feature is an example branch name; --no-ff always records a merge commit)
git checkout main
git pull origin main
git merge --no-ff feature/my-feature
git push origin main

# GitHub Actions auto-triggers on push to main
# Watch: https://github.com/hex406/REPO/actions   (REPO is a placeholder for the repository name)

bash — production release (create a v* tag)

# Always deploy staging first and verify!

# Create a production release tag
# (v2.1.0 is an example version; the tag is created on the current tip of main)
git checkout main
git pull origin main
git tag v2.1.0
git push origin v2.1.0

# GitHub Actions triggers on v* tags
# Or trigger via zelly-ops GitHub section

bash — hotfix flow

# 1. Branch from main (which is the current production state)
# NOTE(review): production is released from v* tags while main deploys to staging,
# so main may contain commits that are not in production yet — confirm main matches
# the latest release tag before branching, otherwise the hotfix tag ships them too.
git checkout main && git pull
git checkout -b hotfix/payment-fix

# 2. Make the fix, commit (git add -p stages changes interactively, hunk by hunk)
git add -p
git commit -m "fix: resolve payment signature mismatch"

# 3. Merge to main and tag immediately
git checkout main
git merge --no-ff hotfix/payment-fix
git push origin main

git tag v2.0.1
git push origin v2.0.1

# 4. Verify production deployment in GitHub Actions and monitor

bash — useful git shortcuts

# Check what's deployed (image tag = commit sha)
# Prints the image of the first container of the first running task.
aws ecs describe-tasks --cluster zelly-production \
  --tasks "$(aws ecs list-tasks --cluster zelly-production --service-name fastify-nova \
  --desired-status RUNNING --region ap-south-1 --query 'taskArns[0]' --output text)" \
  --region ap-south-1 \
  --query 'tasks[0].containers[0].image' --output text

# SHA from that tag → find commit
# (ask git for the commit directly; grepping `git log --oneline` can miss because
# the abbreviated hashes it prints may be shorter than the SHA in the image tag)
git log -1 --oneline abc12345

# See what changed between deployed version and HEAD
# (use the commit SHA from the image tag; "staging-abc12345" is an image tag, not a git ref)
git log --oneline abc12345..HEAD

ClickHouse

bash — connect and query (WireGuard required)

# 10.0.x.x is a placeholder for the ClickHouse private IP; "+" encodes a space in the URL.

# HTTP API (preferred for scripts)
curl -s "http://10.0.x.x:8123/?query=SELECT+version()"

# Interactive client
clickhouse-client --host 10.0.x.x --port 9000

# Check tables
curl -s "http://10.0.x.x:8123/?query=SHOW+TABLES+FROM+zelly_analytics"

# Row counts
# (sum(rows) over active parts; count() on system.parts would count data parts, not rows)
curl -s "http://10.0.x.x:8123/?query=SELECT+table,sum(rows)+FROM+system.parts+WHERE+database='zelly_analytics'+AND+active+GROUP+BY+table"

# Insert test (one JSON object per line in the request body)
curl -X POST "http://10.0.x.x:8123/?query=INSERT+INTO+zelly_analytics.events+FORMAT+JSONEachRow" \
  --data '{"event_type":"test","timestamp":"2024-01-01 00:00:00"}'

bash — ClickHouse EC2 management

# Find ClickHouse instance ID (matches the Name tag "zelly-clickhouse")
aws ec2 describe-instances \
  --filters "Name=tag:Name,Values=zelly-clickhouse" \
  --region ap-south-1 \
  --query 'Reservations[*].Instances[*].{id:InstanceId,state:State.Name,ip:PrivateIpAddress}'

# Connect via bastion SSH tunnel (or use WireGuard)
# Forwards local port 8123 to ClickHouse port 8123 through the bastion.
ssh -i ~/.ssh/zelly-bastion.pem -L 8123:CLICKHOUSE_PRIVATE_IP:8123 \
  ec2-user@BASTION_PUBLIC_IP

# Check Docker container on ClickHouse EC2 (via SSM)
# (--instance-ids is the documented option name for send-command)
aws ssm send-command \
  --instance-ids CLICKHOUSE_INSTANCE_ID \
  --document-name AWS-RunShellScript \
  --parameters '{"commands":["docker ps","docker logs clickhouse --tail 50"]}' \
  --region ap-south-1

Docker — Local Dev

bash — zdev (local docker-compose)

# Alias setup (add to ~/.bashrc or ~/.zshrc)
# The compose file, env file and project directory are all relative paths, so
# run zdev from the directory that contains terraform/.
alias zdev='docker compose -f terraform/local-dev/docker-compose.yml --project-directory . --env-file terraform/local-dev/.env'

# Start all services (detached)
zdev up -d

# Start specific services only
zdev up -d fastify-nova customer-panel

# View logs (follow)
zdev logs -f fastify-nova

# Rebuild a service
zdev up -d --build fastify-nova

# Stop everything
zdev down

# Full reset (remove volumes)
zdev down -v

# Shell into a container
zdev exec fastify-nova sh

PowerShell — zdev function

# Add to $PROFILE
# @args forwards every argument given to zdev on to docker compose.
# Paths are relative, so run zdev from the directory that contains terraform/.
function zdev { docker compose -f terraform/local-dev/docker-compose.yml --project-directory . --env-file terraform/local-dev/.env @args }

# Usage (same as bash)
zdev up -d
zdev logs -f fastify-nova
zdev down

Miscellaneous

bash — AWS identity check

# Which AWS account/role am I using?
aws sts get-caller-identity

# Switch to a different profile
# (export makes it apply to every later aws command in this shell; the second
# call confirms which identity the new profile resolves to)
export AWS_PROFILE=zelly-production
aws sts get-caller-identity

bash — CloudWatch alarms

# List alarms in ALARM state
aws cloudwatch describe-alarms \
  --state-value ALARM \
  --region "$REGION" \
  --query 'MetricAlarms[*].{name:AlarmName,metric:MetricName,reason:StateReason}'

# List ALL alarms with state
aws cloudwatch describe-alarms \
  --region "$REGION" \
  --query 'MetricAlarms[*].{name:AlarmName,state:StateValue,metric:MetricName}'

bash — check ALB target health

# List all target groups (name and ARN only — health comes from the next command)
aws elbv2 describe-target-groups \
  --region "$REGION" \
  --query 'TargetGroups[*].{name:TargetGroupName,arn:TargetGroupArn}' \
  --output table

# Check health for a specific target group
# (TG_ARN is a placeholder: paste an ARN from the table above)
aws elbv2 describe-target-health \
  --target-group-arn TG_ARN \
  --region "$REGION"

bash — check running image tag for all services

# For each service, print the image of the first container of its first running task.
for svc in fastify-nova customer-panel orion-backend events-consumer storefront; do
  # A failed API call (expired credentials, wrong region, ...) is reported as a
  # lookup failure rather than being mistaken for "no running tasks".
  if ! TASK=$(aws ecs list-tasks --cluster "$CLUSTER" --service-name "$svc" \
    --desired-status RUNNING --region "$REGION" --query 'taskArns[0]' --output text); then
    echo "$svc: LOOKUP FAILED"
    continue
  fi
  # With --output text an empty task list comes back as "None"
  if [ -n "$TASK" ] && [ "$TASK" != "None" ]; then
    IMG=$(aws ecs describe-tasks --cluster "$CLUSTER" --tasks "$TASK" --region "$REGION" \
      --query 'tasks[0].containers[0].image' --output text 2>/dev/null)
    echo "$svc: $IMG"
  else
    echo "$svc: NOT RUNNING"
  fi
done