Operations
Cheatsheet & Quick Reference
One-liners and short command sequences for the most common DevOps tasks. All bash commands. PowerShell variants noted where they differ.
Set environment shortcuts before running commands:
# Pick ONE environment and run only that line (if both run, the last one wins).
# Production:
ENV=production REGION=ap-south-1 CLUSTER=zelly-production
# Staging:
ENV=staging REGION=ap-southeast-1 CLUSTER=zelly-staging
ECS & Deployments
bash — service status
# Status summary for every service in the cluster
aws ecs describe-services \
  --cluster "$CLUSTER" \
  --services fastify-nova customer-panel orion-backend events-consumer storefront \
  --region "$REGION" \
  --query 'services[*].{name:serviceName,desired:desiredCount,running:runningCount,status:status,deploy:deployments[0].rolloutState}'

# Full detail for a single service
aws ecs describe-services --cluster "$CLUSTER" --services fastify-nova \
  --region "$REGION" --query 'services[0]'
bash — force redeploy
# Force a new deployment of a single service
aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
  --force-new-deployment --region "$REGION"

# Force a new deployment of several services in one go
# NOTE(review): events-consumer is not in this list although the status
# commands include it — confirm that is intentional.
for svc in fastify-nova customer-panel orion-backend storefront; do
  aws ecs update-service --cluster "$CLUSTER" --service "$svc" \
    --force-new-deployment --region "$REGION"
done
bash — scale service
# Scale to specific count
aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
  --desired-count 2 --region "$REGION"

# Scale to 0 (stop service)
aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
  --desired-count 0 --region "$REGION"
bash — rollback (previous revision)
# Look up the task definition ARN the service currently uses
CURRENT=$(aws ecs describe-services --cluster "$CLUSTER" --services fastify-nova \
  --region "$REGION" --query 'services[0].taskDefinition' --output text)
echo "Current: $CURRENT"

# Roll back one revision: the ARN ends in ":<revision>", so take that suffix,
# subtract one and re-attach it to the rest of the ARN.
# NOTE(review): assumes revision N-1 exists and has not been deregistered.
REV=${CURRENT##*:}
if [[ "$REV" =~ ^[0-9]+$ ]] && (( REV > 1 )); then
  PREV="${CURRENT%:*}:$((REV - 1))"
  aws ecs update-service --cluster "$CLUSTER" --service fastify-nova \
    --task-definition "$PREV" --force-new-deployment --region "$REGION"
else
  echo "No previous revision to roll back to (current: $CURRENT)" >&2
fi
bash — check stopped tasks
# List up to three stopped task ARNs for the service
STOPPED=$(aws ecs list-tasks --cluster "$CLUSTER" --service-name fastify-nova \
  --desired-status STOPPED --region "$REGION" --query 'taskArns[0:3]' --output text)

# Describe them: stop reason plus per-container exit code and reason
if [[ -n "$STOPPED" && "$STOPPED" != "None" ]]; then
  # $STOPPED is intentionally unquoted: it holds several whitespace-separated ARNs
  # shellcheck disable=SC2086
  aws ecs describe-tasks --cluster "$CLUSTER" --tasks $STOPPED --region "$REGION" \
    --query 'tasks[*].{stopped:stoppedReason,containers:containers[*].{name:name,exit:exitCode,reason:reason}}'
else
  echo "No stopped tasks found for fastify-nova" >&2
fi
bash — running tasks with image tags
# Collect the ARNs of all running tasks of the service
TASKS=$(aws ecs list-tasks --cluster "$CLUSTER" --service-name fastify-nova \
  --desired-status RUNNING --region "$REGION" --query 'taskArns' --output text)

# Show task definition and container image/status for each of them
if [[ -n "$TASKS" && "$TASKS" != "None" ]]; then
  # $TASKS is intentionally unquoted: it holds several whitespace-separated ARNs
  # shellcheck disable=SC2086
  aws ecs describe-tasks --cluster "$CLUSTER" --tasks $TASKS --region "$REGION" \
    --query 'tasks[*].{id:taskArn,td:taskDefinitionArn,containers:containers[*].{name:name,image:image,status:lastStatus}}'
else
  echo "No running tasks found for fastify-nova" >&2
fi
ECR & Docker Images
bash — list recent images
# Ten most recently pushed tagged images for a service (always ap-south-1 for ECR)
# The size column is in bytes: imageSizeInBytes is only converted to a string, not to MB.
aws ecr describe-images --repository-name zelly/fastify-nova \
  --region ap-south-1 --filter tagStatus=TAGGED \
  --query 'sort_by(imageDetails, &imagePushedAt)[-10:].{tags:imageTags,pushed:imagePushedAt,sizeBytes:to_string(imageSizeInBytes)}' \
  --output table
bash — ECR login and push manually
AWS_ACCOUNT=279391564627
ECR_REGION=ap-south-1
REPO=$AWS_ACCOUNT.dkr.ecr.$ECR_REGION.amazonaws.com

# Login (password is piped via stdin so it never appears in argv)
aws ecr get-login-password --region "$ECR_REGION" | \
  docker login --username AWS --password-stdin "$REPO"

# Tag and push (tag is computed once so build and push cannot disagree)
IMAGE="$REPO/zelly/fastify-nova:manual-$(git rev-parse --short HEAD)"
docker build -t "$IMAGE" .
docker push "$IMAGE"
bash — re-tag image (promote staging to prod)
# Get the manifest of the staging image
MANIFEST=$(aws ecr batch-get-image \
  --repository-name zelly/fastify-nova \
  --image-ids imageTag=staging-abc12345 \
  --region ap-south-1 \
  --query 'images[0].imageManifest' --output text)

# Push the same manifest under a new tag (no image layers are re-uploaded)
aws ecr put-image \
  --repository-name zelly/fastify-nova \
  --image-tag prod-abc12345 \
  --image-manifest "$MANIFEST" \
  --region ap-south-1
CloudWatch Logs
bash — tail recent logs
# Last 15 minutes of logs (--start-time is epoch milliseconds; 900000 ms = 15 min)
aws logs filter-log-events \
  --log-group-name /zelly/ecs/fastify-nova \
  --start-time $(( $(date +%s) * 1000 - 900000 )) \
  --region "$REGION" \
  --query 'events[*].message' --output text

# Filter by pattern over the last hour.
# CloudWatch filter patterns are case-sensitive: this matches "ERROR", not "error".
aws logs filter-log-events \
  --log-group-name /zelly/ecs/fastify-nova \
  --start-time $(( $(date +%s) * 1000 - 3600000 )) \
  --filter-pattern '"ERROR"' \
  --region "$REGION" \
  --query 'events[*].{ts:timestamp,msg:message}' --output table
PowerShell — tail recent logs
# Last 15 minutes of logs (start time as epoch milliseconds)
$start = [DateTimeOffset]::UtcNow.AddMinutes(-15).ToUnixTimeMilliseconds()
aws logs filter-log-events `
  --log-group-name /zelly/ecs/fastify-nova `
  --start-time $start `
  --region $env:REGION `
  --query 'events[*].message' --output text
CloudWatch Logs Insights — paste in console
# All errors across all Zelly services
fields @timestamp, @logStream, @message
| filter @message like /(?i)(error|exception|fatal|crash)/
| sort @timestamp desc
| limit 100

# Requests slower than 1 second
fields @timestamp, @message
| parse @message '"duration":*,' as duration
| filter ispresent(duration) and duration > 1000
| sort @timestamp desc
| limit 50

# Service startup events
fields @timestamp, @message
| filter @message like /(?i)(listen|started|ready|connected)/
| sort @timestamp desc
| limit 30

# Specific request ID trace
fields @timestamp, @message
| filter @message like /REQUEST_ID_HERE/
| sort @timestamp asc
AWS Secrets Manager
bash — read / update secrets
# Read a secret (pretty-printed)
aws secretsmanager get-secret-value \
  --secret-id zelly/fastify-nova/env \
  --region "$REGION" \
  --query 'SecretString' --output text | python3 -m json.tool

# Update a single key (merge, don't replace)
CURRENT=$(aws secretsmanager get-secret-value \
  --secret-id zelly/fastify-nova/env --region "$REGION" \
  --query 'SecretString' --output text)

# Merge the change into the current JSON. "KEY" / "new-value" are placeholders;
# alternatively edit the JSON in a temp file and load it into UPDATED_JSON.
UPDATED_JSON=$(printf '%s' "$CURRENT" | python3 -c '
import json, sys
secret = json.load(sys.stdin)
secret["KEY"] = "new-value"
print(json.dumps(secret))')

# Only write back when the merge produced something, so the secret is never
# overwritten with an empty value.
if [[ -n "$UPDATED_JSON" ]]; then
  aws secretsmanager put-secret-value \
    --secret-id zelly/fastify-nova/env \
    --secret-string "$UPDATED_JSON" \
    --region "$REGION"
else
  echo "UPDATED_JSON is empty - secret not updated" >&2
fi

# Create a new secret
aws secretsmanager create-secret \
  --name zelly/new-service/env \
  --secret-string '{"KEY":"value"}' \
  --region "$REGION"

# Delete secret immediately, skipping the recovery window
# (7-30 days, default 30) - this cannot be undone
aws secretsmanager delete-secret \
  --secret-id zelly/old-service/env \
  --force-delete-without-recovery \
  --region "$REGION"
bash — list all Zelly secrets
# List every secret whose name starts with "zelly/"
# (the documented option name is --filters, plural)
aws secretsmanager list-secrets \
  --filters Key=name,Values=zelly/ \
  --region "$REGION" \
  --query 'SecretList[*].{name:Name,arn:ARN,last:LastChangedDate}' \
  --output table
ECS does not hot-reload secrets. Always force-redeploy the affected service after updating a secret value.
Aurora MySQL
bash — cluster status
# Cluster status (status, engine version, writer/reader endpoints, Serverless v2 ACU range)
aws rds describe-db-clusters \
  --db-cluster-identifier "zelly-aurora-$ENV" \
  --region "$REGION" \
  --query 'DBClusters[0].{status:Status,engine:EngineVersion,endpoint:Endpoint,reader:ReaderEndpoint,min:ServerlessV2ScalingConfiguration.MinCapacity,max:ServerlessV2ScalingConfiguration.MaxCapacity}'

# List the DB instances that belong to the cluster
aws rds describe-db-instances \
  --filters "Name=db-cluster-id,Values=zelly-aurora-$ENV" \
  --region "$REGION" \
  --query 'DBInstances[*].{id:DBInstanceIdentifier,class:DBInstanceClass,status:DBInstanceStatus}'
bash — connect to Aurora
# Staging: direct (no VPN)
AURORA_HOST=$(aws rds describe-db-clusters \
  --db-cluster-identifier zelly-aurora-staging \
  --region ap-southeast-1 \
  --query 'DBClusters[0].Endpoint' --output text)
mysql -h "$AURORA_HOST" -u admin -p

# Production: WireGuard VPN required first!
# wg show wg0  → should show active handshake
AURORA_HOST=$(aws rds describe-db-clusters \
  --db-cluster-identifier zelly-aurora-production \
  --region ap-south-1 \
  --query 'DBClusters[0].Endpoint' --output text)
mysql -h "$AURORA_HOST" -u admin -p
mysql — useful queries
-- List all schemas
SHOW DATABASES;

-- Check connection count by user/host
SELECT USER, HOST, COUNT(*) as cnt
FROM information_schema.PROCESSLIST
GROUP BY USER, HOST ORDER BY cnt DESC;

-- Generate KILL statements for connections idle for more than 300 seconds.
-- This only prints the statements; copy and run the ones you want to execute.
-- NOTE(review): on RDS/Aurora, killing another user's thread may require
-- CALL mysql.rds_kill(<id>) instead of KILL - confirm for this cluster.
SELECT CONCAT('KILL ', ID, ';') FROM information_schema.PROCESSLIST
WHERE COMMAND = 'Sleep' AND TIME > 300;

-- Check table sizes (20 largest tables by data size in the application schemas)
SELECT table_schema, table_name,
  ROUND(data_length/1024/1024, 1) as data_MB,
  ROUND(index_length/1024/1024, 1) as index_MB
FROM information_schema.TABLES
WHERE table_schema IN ('astro_primary','ecom_store_front','backoffice')
ORDER BY data_MB DESC LIMIT 20;

-- Check slow query log (if enabled)
SHOW VARIABLES LIKE 'slow_query_log%';

-- Show running queries, then kill one by its Id ([process_id] is a placeholder)
SHOW FULL PROCESSLIST;
KILL [process_id];

-- Migration state (TypeORM)
SELECT * FROM astro_primary.migrations ORDER BY timestamp DESC LIMIT 10;
bash — take manual snapshot
# Manual snapshot of the production cluster, named manual-YYYYMMDD-HHMM
aws rds create-db-cluster-snapshot \
  --db-cluster-identifier zelly-aurora-production \
  --db-cluster-snapshot-identifier "manual-$(date +%Y%m%d-%H%M)" \
  --region ap-south-1
Redis & BullMQ
bash — ElastiCache status
# Replication group status and primary endpoint of the first node group
aws elasticache describe-replication-groups \
  --replication-group-id "zelly-redis-$ENV" \
  --region "$REGION" \
  --query 'ReplicationGroups[0].{status:Status,nodeGroups:NodeGroups[0].PrimaryEndpoint}'
bash — connect to Redis (via WireGuard VPN for production)
# Get endpoint
REDIS_ENDPOINT=$(aws elasticache describe-replication-groups \
  --replication-group-id "zelly-redis-$ENV" --region "$REGION" \
  --query 'ReplicationGroups[0].NodeGroups[0].PrimaryEndpoint.Address' --output text)

# Read the auth token from Secrets Manager
REDIS_AUTH=$(aws secretsmanager get-secret-value \
  --secret-id zelly/redis/auth --region "$REGION" \
  --query 'SecretString' --output text \
  | python3 -c "import sys,json; print(json.load(sys.stdin)['auth_token'])")

# Connect (TLS required for ElastiCache).
# The token is passed via REDISCLI_AUTH instead of -a so it does not show up
# in the process list or trigger redis-cli's command-line password warning.
REDISCLI_AUTH="$REDIS_AUTH" redis-cli -h "$REDIS_ENDPOINT" -p 6379 --tls
redis-cli — useful commands
# Queue depths (BullMQ) - the "wait" key is a list
LLEN "bull:store-events:wait"
LLEN "bull:SHOPIFY_WEBHOOK:wait"
LLEN "bull:SHOPIFY_CATALOG_SYNC:wait"

# All BullMQ queues
# KEYS blocks the server while it walks the whole keyspace; on a busy
# production node prefer: SCAN 0 MATCH "bull:*:wait" COUNT 1000
KEYS "bull:*:wait"

# Jobs in failed state - BullMQ keeps failed jobs in a sorted set,
# so use ZCARD (LLEN returns WRONGTYPE on this key)
ZCARD "bull:store-events:failed"

# Check memory usage
INFO memory

# Check connected clients
INFO clients

# Monitor live commands (dev only — high overhead)
MONITOR
WireGuard VPN
bash — VPN status and setup
# Check if WireGuard is running
wg show wg0

# Bring up VPN
wg-quick up wg0

# Bring down VPN
wg-quick down wg0

# Check your VPN IP
ip addr show wg0

# Test connectivity to production private resources
# (10.0.x.x is a placeholder - substitute the real private IPs)
ping -c 3 10.0.0.1          # VPN gateway
nc -zv 10.0.x.x 3306 -w 3   # Aurora MySQL
nc -zv 10.0.x.x 6379 -w 3   # Redis
nc -zv 10.0.x.x 8123 -w 3   # ClickHouse HTTP
PowerShell — VPN status and setup
# NOTE(review): assumes wg-quick and wg are available on PATH on this machine - confirm.
# Bring up VPN
wg-quick up wg0

# Check status
wg show wg0

# Check VPN IP
ipconfig | Select-String "WireGuard" -Context 0,5
bash — add new WireGuard peer via AWS SSM
# Generate keypair on bastion.
# The private key is written to /tmp/priv on the bastion and the public key is
# the command's output. send-command only prints the command ID; read the output with:
#   aws ssm get-command-invocation --command-id <ID> \
#     --instance-id i-BASTION_INSTANCE_ID --region ap-south-1
aws ssm send-command \
  --instance-id i-BASTION_INSTANCE_ID \
  --document-name AWS-RunShellScript \
  --parameters '{"commands":["wg genkey | tee /tmp/priv | wg pubkey"]}' \
  --region ap-south-1 \
  --query 'Command.CommandId' --output text

# Or use zelly-ops VPN section to generate and register peers
Terraform
bash — standard workflow
# Change into ONE of the working directories
cd d:/zelly/terraform                        # Production
cd d:/zelly/terraform/environments/staging   # Staging

# Plan before applying
terraform plan -out tfplan

# Apply exactly what was planned
terraform apply tfplan

# Target a specific resource
terraform apply -target=module.ecs_cluster -target=module.services.fastify_nova

# Refresh state without applying
# (newer Terraform releases recommend: terraform apply -refresh-only)
terraform refresh

# Show all outputs
terraform output
bash — state management
# List all resources in state
terraform state list

# Show a specific resource
terraform state show module.services.aws_ecs_service.fastify_nova

# Import existing resource into state (the ARN is a placeholder)
terraform import aws_secretsmanager_secret.example arn:aws:...

# Remove resource from state (does NOT destroy the resource)
terraform state rm module.old_resource.aws_whatever.name

# Force unlock (if state is locked)
terraform force-unlock LOCK_ID
bash — backend bootstrap (one-time)
# Create S3 state bucket (ap-south-1)
aws s3api create-bucket \
  --bucket zelly-terraform-state \
  --region ap-south-1 \
  --create-bucket-configuration LocationConstraint=ap-south-1

# Keep every version of the state file
aws s3api put-bucket-versioning \
  --bucket zelly-terraform-state \
  --versioning-configuration Status=Enabled

# Create DynamoDB lock table
aws dynamodb create-table \
  --table-name zelly-terraform-locks \
  --attribute-definitions AttributeName=LockID,AttributeType=S \
  --key-schema AttributeName=LockID,KeyType=HASH \
  --billing-mode PAY_PER_REQUEST \
  --region ap-south-1
After any Terraform infrastructure change, ECS services won't auto-redeploy (lifecycle ignore_changes). Always run
`aws ecs update-service --force-new-deployment` for the affected services after infrastructure applies.
Git Release Flow
bash — staging deploy (auto on push to main)
# Deploy to staging by merging to main
git checkout main
git pull origin main
git merge --no-ff feature/my-feature
git push origin main

# GitHub Actions auto-triggers on push to main
# Watch: https://github.com/hex406/REPO/actions
bash — production release (create a v* tag)
# Always deploy staging first and verify!

# Create a production release tag
git checkout main
git pull origin main
git tag v2.1.0
git push origin v2.1.0

# GitHub Actions triggers on v* tags
# Or trigger via zelly-ops GitHub section
bash — hotfix flow
# 1. Branch from main (which is the current production state)
git checkout main && git pull
git checkout -b hotfix/payment-fix

# 2. Make the fix, commit
git add -p
git commit -m "fix: resolve payment signature mismatch"

# 3. Merge to main and tag immediately
git checkout main
git merge --no-ff hotfix/payment-fix
git push origin main
git tag v2.0.1
git push origin v2.0.1

# 4. Verify production deployment in GitHub Actions and monitor
bash — useful git shortcuts
# Check what's deployed (image tag = commit sha)
aws ecs describe-tasks --cluster zelly-production \
  --tasks "$(aws ecs list-tasks --cluster zelly-production --service-name fastify-nova \
    --desired-status RUNNING --region ap-south-1 --query 'taskArns[0]' --output text)" \
  --region ap-south-1 \
  --query 'tasks[0].containers[0].image' --output text

# SHA from that tag → find commit
# (resolve the SHA directly: grepping --oneline output can miss it because the
# abbreviated hashes git prints may be shorter than the SHA in the tag)
git log -1 --oneline abc12345

# See what changed between deployed version and HEAD
# (use the bare SHA: the image tag "staging-abc12345" is not a git ref unless a
# git tag with that name has been created)
git log --oneline abc12345..HEAD
ClickHouse
bash — connect and query (WireGuard required)
# HTTP API (preferred for scripts); 10.0.x.x is a placeholder for the private IP
curl -s "http://10.0.x.x:8123/?query=SELECT+version()"

# Interactive client
clickhouse-client --host 10.0.x.x --port 9000

# Check tables
curl -s "http://10.0.x.x:8123/?query=SHOW+TABLES+FROM+zelly_analytics"

# Row counts: sum the rows of the active parts of each table
# (count() over system.parts would return the number of parts, not rows)
curl -s "http://10.0.x.x:8123/?query=SELECT+table,sum(rows)+FROM+system.parts+WHERE+database='zelly_analytics'+AND+active+GROUP+BY+table"

# Insert test
curl -X POST "http://10.0.x.x:8123/?query=INSERT+INTO+zelly_analytics.events+FORMAT+JSONEachRow" \
  --data '{"event_type":"test","timestamp":"2024-01-01 00:00:00"}'
bash — ClickHouse EC2 management
# Find ClickHouse instance ID, state and private IP by Name tag
aws ec2 describe-instances \
  --filters "Name=tag:Name,Values=zelly-clickhouse" \
  --region ap-south-1 \
  --query 'Reservations[*].Instances[*].{id:InstanceId,state:State.Name,ip:PrivateIpAddress}'

# Connect via bastion SSH tunnel (or use WireGuard):
# forwards local port 8123 to ClickHouse's HTTP port through the bastion
ssh -i ~/.ssh/zelly-bastion.pem -L 8123:CLICKHOUSE_PRIVATE_IP:8123 \
  ec2-user@BASTION_PUBLIC_IP

# Check Docker container on ClickHouse EC2 (via SSM)
aws ssm send-command \
  --instance-id CLICKHOUSE_INSTANCE_ID \
  --document-name AWS-RunShellScript \
  --parameters '{"commands":["docker ps","docker logs clickhouse --tail 50"]}' \
  --region ap-south-1
Docker — Local Dev
bash — zdev (local docker-compose)
# Alias setup (add to ~/.bashrc or ~/.zshrc)
alias zdev='docker compose -f terraform/local-dev/docker-compose.yml --project-directory . --env-file terraform/local-dev/.env'

# Start all services
zdev up -d

# Start specific services only
zdev up -d fastify-nova customer-panel

# View logs
zdev logs -f fastify-nova

# Rebuild a service
zdev up -d --build fastify-nova

# Stop everything
zdev down

# Full reset (remove volumes)
zdev down -v

# Shell into a container
zdev exec fastify-nova sh
PowerShell — zdev function
# Add to $PROFILE
# Wrapper around docker compose for the local-dev stack; @args forwards every
# argument given to zdev on to docker compose.
function zdev {
  docker compose -f terraform/local-dev/docker-compose.yml --project-directory . --env-file terraform/local-dev/.env @args
}

# Usage (same as bash)
zdev up -d
zdev logs -f fastify-nova
zdev down
Miscellaneous
bash — AWS identity check
# Which AWS account/role am I using?
aws sts get-caller-identity

# Switch to a different profile, then confirm the identity changed
export AWS_PROFILE=zelly-production
aws sts get-caller-identity
bash — CloudWatch alarms
# List alarms in ALARM state
aws cloudwatch describe-alarms \
  --state-value ALARM \
  --region "$REGION" \
  --query 'MetricAlarms[*].{name:AlarmName,metric:MetricName,reason:StateReason}'

# List ALL alarms with state
aws cloudwatch describe-alarms \
  --region "$REGION" \
  --query 'MetricAlarms[*].{name:AlarmName,state:StateValue,metric:MetricName}'
bash — check ALB target health
# List all target groups (name and ARN) - the ARN is needed for the health check below
aws elbv2 describe-target-groups \
  --region "$REGION" \
  --query 'TargetGroups[*].{name:TargetGroupName,arn:TargetGroupArn}' \
  --output table

# Check health for a specific target group (TG_ARN is a placeholder)
aws elbv2 describe-target-health \
  --target-group-arn TG_ARN \
  --region "$REGION"
bash — check running image tag for all services
# Print "<service>: <image>" for the first running task of every service.
# CLI errors are discarded (2>/dev/null), so a failed lookup (expired
# credentials, wrong region, ...) is also reported as NOT RUNNING.
for svc in fastify-nova customer-panel orion-backend events-consumer storefront; do
  TASK=$(aws ecs list-tasks --cluster "$CLUSTER" --service-name "$svc" \
    --desired-status RUNNING --region "$REGION" --query 'taskArns[0]' --output text 2>/dev/null)

  # An empty list comes back as an empty string or the literal "None"
  if [[ -z "$TASK" || "$TASK" == "None" ]]; then
    echo "$svc: NOT RUNNING"
    continue
  fi

  IMG=$(aws ecs describe-tasks --cluster "$CLUSTER" --tasks "$TASK" --region "$REGION" \
    --query 'tasks[0].containers[0].image' --output text 2>/dev/null)
  echo "$svc: $IMG"
done