MLOps - Apache Airflow


MLOps

Apache Airflow

Install Airflow (Official Helm Chart)

values.yaml

# Helm values for the official Apache Airflow chart (apache-airflow/airflow).

# Expose the Airflow web UI through an AWS ALB managed by the
# AWS Load Balancer Controller.
ingress:
  web:
    enabled: true
    annotations:
      # Ingresses that share a group name are served by the same ALB.
      alb.ingress.kubernetes.io/group.name: mlops
      # Folded scalar: the lines below are joined with single spaces, giving
      # one comma-separated subnet list.
      alb.ingress.kubernetes.io/subnets: >-
        bys-dev-ue1-sbn-1a-extelb,
        bys-dev-ue1-sbn-1b-extelb,
        bys-dev-ue1-sbn-1c-extelb,
        bys-dev-ue1-sbn-1d-extelb,
        bys-dev-ue1-sbn-1f-extelb
      alb.ingress.kubernetes.io/scheme: internet-facing
      # HTTPS only; the certificate below is the one presented on port 443.
      alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS": 443}]'
      alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:us-east-1:558846430793:certificate/a5207b24-ae67-49ac-b34e-f34ed0088bca
      alb.ingress.kubernetes.io/security-groups: sg-07e6c272df0bed7ee
      # Target-group health check. Annotation values must be strings, so the
      # numeric settings are quoted.
      alb.ingress.kubernetes.io/healthcheck-path: /
      alb.ingress.kubernetes.io/healthcheck-interval-seconds: '15'
      alb.ingress.kubernetes.io/healthcheck-timeout-seconds: '10'
      alb.ingress.kubernetes.io/healthy-threshold-count: '2'
      alb.ingress.kubernetes.io/unhealthy-threshold-count: '4'
      alb.ingress.kubernetes.io/healthcheck-port: traffic-port
      # 302 and 401 are accepted alongside 200, presumably because "/" can
      # redirect to or require login - confirm against the webserver.
      alb.ingress.kubernetes.io/success-codes: 200,302,401
      # Register pod IPs directly as ALB targets.
      alb.ingress.kubernetes.io/target-type: ip
      alb.ingress.kubernetes.io/tags: auto-delete=no
    path: "/"
    pathType: "Prefix"
    hosts:
      - name: airflow.bys.asia
    ingressClassName: "alb"

webserver:
  # WARNING: the ingress above is internet-facing, so these default
  # credentials are reachable from the internet. Change the password (or
  # disable the default user) for anything other than a throwaway test setup.
  defaultUser:
    enabled: true
    role: Admin
    username: admin
    password: admin
    email: admin@test.com
    firstName: bys
    lastName: ko
  resources:
    requests:
      cpu: 1000m
      memory: 2000Mi
    limits:
      cpu: 2000m
      memory: 2000Mi

workers:
  replicas: 2
  resources:
    requests:
      cpu: 1500m
      memory: 4000Mi
    limits:
      cpu: 1500m
      memory: 4000Mi

scheduler:
  resources:
    requests:
      cpu: 400m
      memory: 1000Mi
    limits:
      cpu: 1000m
      memory: 1000Mi

triggerer:
  resources:
    requests:
      cpu: 400m
      memory: 1000Mi
    limits:
      cpu: 1000m
      memory: 1000Mi

# Pull DAG files from Git with a git-sync sidecar.
dags:
  gitSync:
    enabled: true
    repo: https://github.com/byoungsoo/airflow-dags.git
    # NOTE(review): branch/rev are presumably the git-sync v3 settings and
    # "ref" their v4 replacement - confirm against the chart version in use.
    branch: main
    rev: HEAD
    # The git revision (branch, tag, or hash) to check out, v4 only
    ref: main
    depth: 1
    maxFailures: 0
    subPath: ""
    # Name of the Kubernetes Secret that holds the Git username and token.
    credentialsSecret: git-credentials
# Register the official Apache Airflow chart repository and refresh the local chart index.
helm repo add apache-airflow https://airflow.apache.org
helm repo update

# Install the "airflow" release, or upgrade it in place if it already exists, with the custom values file.
helm upgrade --install airflow apache-airflow/airflow \
  --namespace airflow \
  --values /Users/bys/workspace/kubernetes/mlops/airflow/values.yaml
# Remove the release when it is no longer needed.
helm delete airflow --namespace airflow



Git Sync

# Git credentials for the git-sync sidecar; the Secret name matches
# dags.gitSync.credentialsSecret in values.yaml.
apiVersion: v1
kind: Secret
metadata:
  name: git-credentials
  namespace: airflow
data:
  # Every value under "data" must be base64-encoded. base64 is an encoding,
  # not encryption: never commit or publish a real token here. Generate the
  # value locally, e.g. `echo -n '<token>' | base64`, and revoke any token
  # that has already been exposed.
  # For git-sync v3
  GIT_SYNC_USERNAME: YnlvdW5nc29v
  GIT_SYNC_PASSWORD: <base64-encoded-github-token>
  # For git-sync v4
  GITSYNC_USERNAME: YnlvdW5nc29v
  GITSYNC_PASSWORD: <base64-encoded-github-token>

Troubleshooting

1. Node 상태 NotReady로 변경

The node's status became NotReady, and the pods running on that node changed to Terminating status.

Using the metrics source below in CloudWatch, I checked the average IOPS of the node's root volume and found that its I/O utilization had reached 100%. Because the volume was saturated, I couldn't access the node through the SSM agent and the kubelet didn't work properly, which left the pods stuck in Terminating status.

{
    "metrics": [
        [ "AWS/EBS", "VolumeQueueLength", "VolumeId", "<volume-id>", { "id": "m1" } ],
        [ ".", "VolumeIdleTime", ".", ".", { "id": "m2", "visible": false } ],
        [ ".", "VolumeReadOps", ".", ".", { "id": "m3", "visible": false } ],
        [ ".", "VolumeWriteOps", ".", ".", { "id": "m4", "visible": false } ],
        [ ".", "VolumeReadBytes", ".", ".", { "id": "m5", "visible": false } ],
        [ ".", "VolumeWriteBytes", ".", ".", { "id": "m6", "visible": false } ],
        [ ".", "VolumeTotalReadTime", ".", ".", { "id": "m7", "visible": false } ],
        [ ".", "VolumeTotalWriteTime", ".", ".", { "id": "m8", "visible": false } ],
        [ ".", "BurstBalance", ".", ".", { "id": "m9", "visible": false } ],
        [ { "expression": "100-((m2/PERIOD(m2))*100)", "label": "IO Use Volume %", "id": "io_use", "visible": false } ],
        [ { "expression": "IF(m3!=0,(m7/m3)*1000, 0)", "label": "Average Read latency (ms)", "id": "r_lt", "visible": false } ],
        [ { "expression": "IF(m4!=0,(m8/m4)*1000, 0)", "label": "Average Write latency (ms)", "id": "w_lt", "visible": false } ],
        [ { "expression": "IF(m3!=0, (m5/m3)/1024, 0)", "label": "Read IO Size (KiB)", "id": "r_sz", "visible": false } ],
        [ { "expression": "IF(m4!=0, (m6/m4)/1024, 0)", "label": "Write IO Size (KiB)", "id": "w_sz", "visible": false } ],
        [ { "expression": "m3/PERIOD(m3)", "label": "Average Read IOPS", "id": "r_iops", "visible": false } ],
        [ { "expression": "m4/PERIOD(m4)", "label": "Average Write IOPS", "id": "w_iops", "visible": false } ],
        [ { "expression": "r_iops+w_iops", "label": "Average Total IOPS", "id": "t_iops", "visible": false } ],
        [ { "expression": "(m5/PERIOD(m5))/1024^2", "label": "Average Read Throughput (MiB/s)", "id": "r_tp", "visible": false } ],
        [ { "expression": "(m6/PERIOD(m6))/1024^2", "label": "Average Write Throughput (MiB/s)", "id": "w_tp", "visible": false } ],
        [ { "expression": "r_tp+w_tp", "label": "Average Total Throughput (MiB/s)", "id": "t_tp", "visible": false } ],
        [ { "expression": "(m3)/(PERIOD(m2)-m2)", "label": "Bursting Read IOPS", "id": "rb_iops", "visible": false } ],
        [ { "expression": "(m4)/(PERIOD(m2)-m2)", "label": "Bursting Write IOPS", "id": "wb_iops", "visible": false } ],
        [ { "expression": "rb_iops+wb_iops", "label": "Bursting Total IOPS", "id": "tb_iops", "visible": false } ],
        [ { "expression": "((m5)/(PERIOD(m2)-m2))/1024^2", "label": "Bursting Read Throughput (MiB/s)", "id": "rb_tp", "visible": false } ],
        [ { "expression": "((m6)/(PERIOD(m2)-m2))/1024^2", "label": "Bursting Write Throughput (MiB/s)", "id": "wb_tp", "visible": false } ],
        [ { "expression": "rb_tp+wb_tp", "label": "Bursting Total Throughput (MiB/s)", "id": "tb_tp", "visible": false } ]
    ],
    "view": "timeSeries",
    "stacked": false,
    "stat": "Sum",
    "period": 60
}

volume_metrics

Tag: [ mlops  airflow  ]