MLOps - Apache Airflow
By Bys on March 7, 2025
MLOps
Apache Airflow
Install Airflow(Official)
# Helm values for the official apache-airflow/airflow chart.

# Expose the Airflow web UI through an AWS Application Load Balancer.
ingress:
  web:
    enabled: true
    annotations:
      alb.ingress.kubernetes.io/group.name: mlops
      alb.ingress.kubernetes.io/subnets: bys-dev-ue1-sbn-1a-extelb, bys-dev-ue1-sbn-1b-extelb, bys-dev-ue1-sbn-1c-extelb, bys-dev-ue1-sbn-1d-extelb, bys-dev-ue1-sbn-1f-extelb
      alb.ingress.kubernetes.io/scheme: internet-facing
      alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS": 443}]'
      alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:us-east-1:558846430793:certificate/a5207b24-ae67-49ac-b34e-f34ed0088bca
      alb.ingress.kubernetes.io/security-groups: sg-07e6c272df0bed7ee
      alb.ingress.kubernetes.io/healthcheck-path: /
      alb.ingress.kubernetes.io/healthcheck-interval-seconds: '15'
      alb.ingress.kubernetes.io/healthcheck-timeout-seconds: '10'
      alb.ingress.kubernetes.io/healthy-threshold-count: '2'
      alb.ingress.kubernetes.io/unhealthy-threshold-count: '4'
      alb.ingress.kubernetes.io/healthcheck-port: traffic-port
      # Annotation values must be strings; quoted so the comma-separated
      # list is never reinterpreted by a YAML parser.
      alb.ingress.kubernetes.io/success-codes: '200,302,401'
      alb.ingress.kubernetes.io/target-type: ip
      alb.ingress.kubernetes.io/tags: auto-delete=no
    path: "/"
    pathType: "Prefix"
    hosts:
      - name: airflow.bys.asia
    ingressClassName: "alb"

webserver:
  # NOTE(review): these are default admin/admin credentials and the ingress
  # above is internet-facing; change the password before exposing the UI.
  defaultUser:
    enabled: true
    role: Admin
    username: admin
    password: admin
    email: admin@test.com
    firstName: bys
    lastName: ko
  resources:
    requests:
      cpu: 1000m
      memory: 2000Mi
    limits:
      cpu: 2000m
      memory: 2000Mi

workers:
  replicas: 2
  resources:
    requests:
      cpu: 1500m
      memory: 4000Mi
    limits:
      cpu: 1500m
      memory: 4000Mi

scheduler:
  resources:
    requests:
      cpu: 400m
      memory: 1000Mi
    limits:
      cpu: 1000m
      memory: 1000Mi

triggerer:
  resources:
    requests:
      cpu: 400m
      memory: 1000Mi
    limits:
      cpu: 1000m
      memory: 1000Mi

# Pull DAG definitions from a Git repository with the git-sync sidecar.
dags:
  gitSync:
    enabled: true
    repo: https://github.com/byoungsoo/airflow-dags.git
    branch: main
    rev: HEAD
    # The git revision (branch, tag, or hash) to check out, v4 only
    ref: main
    depth: 1
    maxFailures: 0
    subPath: ""
    # Name of the Kubernetes Secret that holds the Git username and token.
    credentialsSecret: git-credentials
# Register the Apache Airflow chart repository and refresh the local index.
helm repo add apache-airflow https://airflow.apache.org
helm repo update

# Install the "airflow" release, or upgrade it in place if it already exists,
# using the values file shown above.
helm upgrade --install airflow apache-airflow/airflow \
  --namespace airflow \
  --values /Users/bys/workspace/kubernetes/mlops/airflow/values.yaml

# Teardown only: removes the "airflow" release from the namespace.
helm uninstall airflow --namespace airflow
Git Sync
# Secret referenced by dags.gitSync.credentialsSecret in the Helm values.
# Every value under `data` must be base64-encoded.
# NOTE(review): a real-looking personal access token was embedded here. It has
# been replaced with a placeholder; revoke the exposed token and never publish
# or commit live credentials.
apiVersion: v1
kind: Secret
metadata:
  name: git-credentials
  namespace: airflow
data:
  # For git-sync v3
  GIT_SYNC_USERNAME: YnlvdW5nc29v
  GIT_SYNC_PASSWORD: REPLACE_WITH_BASE64_ENCODED_GIT_TOKEN
  # For git-sync v4
  GITSYNC_USERNAME: YnlvdW5nc29v
  GITSYNC_PASSWORD: REPLACE_WITH_BASE64_ENCODED_GIT_TOKEN
Troubleshooting
1. Node 상태 NotReady로 변경
The node status changed to NotReady, and the pods running on that node moved to the Terminating status.
Using the metrics source below in CloudWatch, I checked the average IOPS of the node's root volume and found that IOPS usage had reached 100%.
As a result, I could not access the node through the SSM agent, and the kubelet stopped working properly, which left the pods stuck in the Terminating status.
{
  "metrics": [
    [ "AWS/EBS", "VolumeQueueLength", "VolumeId", "<volume-id>", { "id": "m1" } ],
    [ ".", "VolumeIdleTime", ".", ".", { "id": "m2", "visible": false } ],
    [ ".", "VolumeReadOps", ".", ".", { "id": "m3", "visible": false } ],
    [ ".", "VolumeWriteOps", ".", ".", { "id": "m4", "visible": false } ],
    [ ".", "VolumeReadBytes", ".", ".", { "id": "m5", "visible": false } ],
    [ ".", "VolumeWriteBytes", ".", ".", { "id": "m6", "visible": false } ],
    [ ".", "VolumeTotalReadTime", ".", ".", { "id": "m7", "visible": false } ],
    [ ".", "VolumeTotalWriteTime", ".", ".", { "id": "m8", "visible": false } ],
    [ ".", "BurstBalance", ".", ".", { "id": "m9", "visible": false } ],
    [ { "expression": "100-((m2/PERIOD(m2))*100)", "label": "IO Use Volume %", "id": "io_use", "visible": false } ],
    [ { "expression": "IF(m3!=0,(m7/m3)*1000, 0)", "label": "Average Read latency (ms)", "id": "r_lt", "visible": false } ],
    [ { "expression": "IF(m4!=0,(m8/m4)*1000, 0)", "label": "Average Write latency (ms)", "id": "w_lt", "visible": false } ],
    [ { "expression": "IF(m3!=0, (m5/m3)/1024, 0)", "label": "Read IO Size (KiB)", "id": "r_sz", "visible": false } ],
    [ { "expression": "IF(m4!=0, (m6/m4)/1024, 0)", "label": "Write IO Size (KiB)", "id": "w_sz", "visible": false } ],
    [ { "expression": "m3/PERIOD(m3)", "label": "Average Read IOPS", "id": "r_iops", "visible": false } ],
    [ { "expression": "m4/PERIOD(m4)", "label": "Average Write IOPS", "id": "w_iops", "visible": false } ],
    [ { "expression": "r_iops+w_iops", "label": "Average Total IOPS", "id": "t_iops", "visible": false } ],
    [ { "expression": "(m5/PERIOD(m5))/1024^2", "label": "Average Read Throughput (MiB/s)", "id": "r_tp", "visible": false } ],
    [ { "expression": "(m6/PERIOD(m6))/1024^2", "label": "Average Write Throughput (MiB/s)", "id": "w_tp", "visible": false } ],
    [ { "expression": "r_tp+w_tp", "label": "Average Total Throughput (MiB/s)", "id": "t_tp", "visible": false } ],
    [ { "expression": "(m3)/(PERIOD(m2)-m2)", "label": "Bursting Read IOPS", "id": "rb_iops", "visible": false } ],
    [ { "expression": "(m4)/(PERIOD(m2)-m2)", "label": "Bursting Write IOPS", "id": "wb_iops", "visible": false } ],
    [ { "expression": "rb_iops+wb_iops", "label": "Bursting Total IOPS", "id": "tb_iops", "visible": false } ],
    [ { "expression": "((m5)/(PERIOD(m2)-m2))/1024^2", "label": "Bursting Read Throughput (MiB/s)", "id": "rb_tp", "visible": false } ],
    [ { "expression": "((m6)/(PERIOD(m2)-m2))/1024^2", "label": "Bursting Write Throughput (MiB/s)", "id": "wb_tp", "visible": false } ],
    [ { "expression": "rb_tp+wb_tp", "label": "Bursting Total Throughput (MiB/s)", "id": "tb_tp", "visible": false } ]
  ],
  "view": "timeSeries",
  "stacked": false,
  "stat": "Sum",
  "period": 60
}
mlops
airflow
]