AEWS Week 11 Notes
๐ Getting Started: GenAI with Inferentia & FSx Workshop
1. Open AWS Cloud9
Select the genaifsxworkshoponeks environment and click Open to launch the Cloud9 IDE.
2. Create a New Terminal
Once the Cloud9 IDE loads, click the (+) button on the top tab bar > New Terminal to open a terminal.
3. Disable Managed Temporary Credentials
WSParticipantRole:~/environment $ aws cloud9 update-environment --environment-id ${C9_PID} --managed-credentials-action DISABLE
WSParticipantRole:~/environment $ rm -vf ${HOME}/.aws/credentials
# Result
removed '/home/ec2-user/.aws/credentials'
4. Verify the IAM Role
WSParticipantRole:~/environment $ aws sts get-caller-identity
{
"UserId": "XXXXXXXXXXXXXXXXXXXXXX:x-xxxxxxxxxxxxxxxxxx",
"Account": "xxxxxxxxxxxxx",
"Arn": "arn:aws:sts::xxxxxxxxxxxxx:assumed-role/genaifsxworkshoponeks-C9Role-NsrVsrgsvUf3/x-xxxxxxxxxxxxxxxxxx"
}
5. Set the Region and EKS Cluster Name
WSParticipantRole:~/environment $ TOKEN=`curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"`
WSParticipantRole:~/environment $ export AWS_REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region)
WSParticipantRole:~/environment $ export CLUSTER_NAME=eksworkshop
6. Verify the Region and EKS Cluster Name
WSParticipantRole:~/environment $ echo $AWS_REGION
us-west-2
WSParticipantRole:~/environment $ echo $CLUSTER_NAME
eksworkshop
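These variables only live in the current shell. A minimal sketch for keeping them across new Cloud9 terminals (my addition, assuming new terminals source ~/.bashrc as usual):

# Append the exports so every new shell picks them up
echo "export AWS_REGION=${AWS_REGION}" >> ~/.bashrc
echo "export CLUSTER_NAME=${CLUSTER_NAME}" >> ~/.bashrc
source ~/.bashrc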
7. Update kubeconfig
WSParticipantRole:~/environment $ aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
# Result
Added new context arn:aws:eks:us-west-2:xxxxxxxxxxxxx:cluster/eksworkshop to /home/ec2-user/.kube/config
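As a quick sanity check (my addition, not a workshop step), confirm kubectl now points at the workshop cluster:

# Show the context kubeconfig currently points at
kubectl config current-context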
8. Check EKS Node Status
WSParticipantRole:~/environment $ kubectl get nodes
NAME STATUS ROLES AGE VERSION
ip-10-0-108-71.us-west-2.compute.internal Ready <none> 47h v1.30.9-eks-5d632ec
ip-10-0-67-165.us-west-2.compute.internal Ready <none> 47h v1.30.9-eks-5d632ec
9. Inspect the Karpenter Deployment
WSParticipantRole:~/environment $ kubectl -n karpenter get deploy/karpenter -o yaml
✅ Output
apiVersion: apps/v1
kind: Deployment
metadata:
annotations:
deployment.kubernetes.io/revision: "1"
meta.helm.sh/release-name: karpenter
meta.helm.sh/release-namespace: karpenter
creationTimestamp: "2025-04-17T02:22:13Z"
generation: 1
labels:
app.kubernetes.io/instance: karpenter
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: karpenter
app.kubernetes.io/version: 1.0.1
helm.sh/chart: karpenter-1.0.1
name: karpenter
namespace: karpenter
resourceVersion: "3161"
uid: 51294e16-448c-48f8-93ce-179331c7e2ca
spec:
progressDeadlineSeconds: 600
replicas: 2
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/instance: karpenter
app.kubernetes.io/name: karpenter
strategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
creationTimestamp: null
labels:
app.kubernetes.io/instance: karpenter
app.kubernetes.io/name: karpenter
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: karpenter.sh/nodepool
operator: DoesNotExist
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/instance: karpenter
app.kubernetes.io/name: karpenter
topologyKey: kubernetes.io/hostname
containers:
- env:
- name: KUBERNETES_MIN_VERSION
value: 1.19.0-0
- name: KARPENTER_SERVICE
value: karpenter
- name: WEBHOOK_PORT
value: "8443"
- name: WEBHOOK_METRICS_PORT
value: "8001"
- name: DISABLE_WEBHOOK
value: "false"
- name: LOG_LEVEL
value: info
- name: METRICS_PORT
value: "8080"
- name: HEALTH_PROBE_PORT
value: "8081"
- name: SYSTEM_NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
- name: MEMORY_LIMIT
valueFrom:
resourceFieldRef:
containerName: controller
divisor: "0"
resource: limits.memory
- name: FEATURE_GATES
value: SpotToSpotConsolidation=false
- name: BATCH_MAX_DURATION
value: 10s
- name: BATCH_IDLE_DURATION
value: 1s
- name: CLUSTER_NAME
value: eksworkshop
- name: CLUSTER_ENDPOINT
value: https://53E5113441C691170249AE781B50CCEE.gr7.us-west-2.eks.amazonaws.com
- name: VM_MEMORY_OVERHEAD_PERCENT
value: "0.075"
- name: INTERRUPTION_QUEUE
value: karpenter-eksworkshop
- name: RESERVED_ENIS
value: "0"
image: public.ecr.aws/karpenter/controller:1.0.1@sha256:fc54495b35dfeac6459ead173dd8452ca5d572d90e559f09536a494d2795abe6
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: http
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
name: controller
ports:
- containerPort: 8080
name: http-metrics
protocol: TCP
- containerPort: 8001
name: webhook-metrics
protocol: TCP
- containerPort: 8443
name: https-webhook
protocol: TCP
- containerPort: 8081
name: http
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /readyz
port: http
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
resources: {}
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
seccompProfile:
type: RuntimeDefault
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
dnsPolicy: ClusterFirst
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
restartPolicy: Always
schedulerName: default-scheduler
securityContext:
fsGroup: 65532
serviceAccount: karpenter
serviceAccountName: karpenter
terminationGracePeriodSeconds: 30
tolerations:
- key: CriticalAddonsOnly
operator: Exists
topologySpreadConstraints:
- labelSelector:
matchLabels:
app.kubernetes.io/instance: karpenter
app.kubernetes.io/name: karpenter
maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: DoNotSchedule
status:
availableReplicas: 2
conditions:
- lastTransitionTime: "2025-04-17T02:22:23Z"
lastUpdateTime: "2025-04-17T02:22:23Z"
message: Deployment has minimum availability.
reason: MinimumReplicasAvailable
status: "True"
type: Available
- lastTransitionTime: "2025-04-17T02:22:13Z"
lastUpdateTime: "2025-04-17T02:22:33Z"
message: ReplicaSet "karpenter-86d7868f9f" has successfully progressed.
reason: NewReplicaSetAvailable
status: "True"
type: Progressing
observedGeneration: 1
readyReplicas: 2
replicas: 2
updatedReplicas: 2
- https://karpenter.sh/docs/reference/settings/
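To pull just the controller settings out of that long manifest, a small jsonpath sketch (my addition, not a workshop step):

# List the controller env vars (CLUSTER_NAME, FEATURE_GATES, BATCH_* etc.) without the full YAML
kubectl -n karpenter get deploy/karpenter \
  -o jsonpath='{range .spec.template.spec.containers[0].env[*]}{.name}={.value}{"\n"}{end}'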
10. Check Karpenter Pod Status
WSParticipantRole:~/environment $ kubectl get pods --namespace karpenter
✅ Output
NAME READY STATUS RESTARTS AGE
karpenter-86d7868f9f-2dmfd 1/1 Running 0 47h
karpenter-86d7868f9f-zjbl8 1/1 Running 0 47h
11. Stream Karpenter Logs
Set an alias in the terminal, then stream the Karpenter controller logs.
WSParticipantRole:~/environment $ alias kl='kubectl -n karpenter logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20'
WSParticipantRole:~/environment $ kl
✅ Output
{"level":"INFO","time":"2025-04-17T02:22:21.024Z","logger":"controller.controller-runtime.metrics","message":"Starting metrics server","commit":"62a726c"}
{"level":"INFO","time":"2025-04-17T02:22:21.025Z","logger":"controller","message":"starting server","commit":"62a726c","name":"health probe","addr":"[::]:8081"}
{"level":"INFO","time":"2025-04-17T02:22:21.025Z","logger":"controller.controller-runtime.metrics","message":"Serving metrics server","commit":"62a726c","bindAddress":":8080","secure":false}
{"level":"INFO","time":"2025-04-17T02:22:21.126Z","logger":"controller","message":"attempting to acquire leader lease karpenter/karpenter-leader-election...","commit":"62a726c"}
{"level":"INFO","time":"2025-04-17T02:22:19.397Z","logger":"controller","message":"Starting workers","commit":"62a726c","controller":"nodeclaim.consistency","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","worker count":10}
{"level":"INFO","time":"2025-04-17T02:22:19.397Z","logger":"controller","message":"Starting workers","commit":"62a726c","controller":"nodeclaim.disruption","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","worker count":10}
{"level":"ERROR","time":"2025-04-17T02:22:19.663Z","logger":"webhook.ConversionWebhook","message":"Reconcile error","commit":"62a726c","knative.dev/traceid":"85caea6f-8f7a-4cd3-9dcc-642a21a15959","knative.dev/key":"nodeclaims.karpenter.sh","duration":"154.257731ms","error":"failed to update webhook: Operation cannot be fulfilled on customresourcedefinitions.apiextensions.k8s.io \"nodeclaims.karpenter.sh\": the object has been modified; please apply your changes to the latest version and try again"}
{"level":"ERROR","time":"2025-04-17T02:22:19.686Z","logger":"webhook.ConversionWebhook","message":"Reconcile error","commit":"62a726c","knative.dev/traceid":"72b3cf48-22e3-410a-9667-962c8534b26d","knative.dev/key":"nodepools.karpenter.sh","duration":"97.713552ms","error":"failed to update webhook: Operation cannot be fulfilled on customresourcedefinitions.apiextensions.k8s.io \"nodepools.karpenter.sh\": the object has been modified; please apply your changes to the latest version and try again"}
{"level":"INFO","time":"2025-04-17T02:23:05.822Z","logger":"controller","message":"discovered ssm parameter","commit":"62a726c","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"sysprep"},"namespace":"","name":"sysprep","reconcileID":"859590e8-76f0-4884-9c38-ac6378cc40c3","parameter":"/aws/service/eks/optimized-ami/1.30/amazon-linux-2/amazon-eks-node-1.30-v20240917/image_id","value":"ami-05f7e80c30f28d8b9"}
{"level":"INFO","time":"2025-04-17T02:23:05.852Z","logger":"controller","message":"discovered ssm parameter","commit":"62a726c","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"sysprep"},"namespace":"","name":"sysprep","reconcileID":"859590e8-76f0-4884-9c38-ac6378cc40c3","parameter":"/aws/service/eks/optimized-ami/1.30/amazon-linux-2-arm64/amazon-eks-arm64-node-1.30-v20240917/image_id","value":"ami-0b402b9a4c1bacaa5"}
{"level":"INFO","time":"2025-04-17T02:23:05.881Z","logger":"controller","message":"discovered ssm parameter","commit":"62a726c","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"sysprep"},"namespace":"","name":"sysprep","reconcileID":"859590e8-76f0-4884-9c38-ac6378cc40c3","parameter":"/aws/service/eks/optimized-ami/1.30/amazon-linux-2-gpu/amazon-eks-gpu-node-1.30-v20240917/image_id","value":"ami-0356f40aea17e9b9e"}
{"level":"INFO","time":"2025-04-17T02:23:07.175Z","logger":"controller","message":"found provisionable pod(s)","commit":"62a726c","controller":"provisioner","namespace":"","name":"","reconcileID":"869e7086-1212-4127-b4b3-94480c1680cf","Pods":"default/sysprep-l4wpv","duration":"350.444011ms"}
{"level":"INFO","time":"2025-04-17T02:23:07.175Z","logger":"controller","message":"computed new nodeclaim(s) to fit pod(s)","commit":"62a726c","controller":"provisioner","namespace":"","name":"","reconcileID":"869e7086-1212-4127-b4b3-94480c1680cf","nodeclaims":1,"pods":1}
{"level":"INFO","time":"2025-04-17T02:23:07.186Z","logger":"controller","message":"created nodeclaim","commit":"62a726c","controller":"provisioner","namespace":"","name":"","reconcileID":"869e7086-1212-4127-b4b3-94480c1680cf","NodePool":{"name":"sysprep"},"NodeClaim":{"name":"sysprep-r6fgf"},"requests":{"cpu":"180m","memory":"120Mi","pods":"9"},"instance-types":"c5.large, c5.xlarge, c5a.large, c5a.xlarge, c5ad.large and 55 other(s)"}
{"level":"INFO","time":"2025-04-17T02:23:13.205Z","logger":"controller","message":"launched nodeclaim","commit":"62a726c","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"sysprep-r6fgf"},"namespace":"","name":"sysprep-r6fgf","reconcileID":"f2b2ec5c-bcf7-4402-a919-a8280a1e9cac","provider-id":"aws:///us-west-2a/i-0dd7235d205d3547d","instance-type":"c6i.large","zone":"us-west-2a","capacity-type":"on-demand","allocatable":{"cpu":"1930m","ephemeral-storage":"89Gi","memory":"3114Mi","pods":"29","vpc.amazonaws.com/pod-eni":"9"}}
{"level":"INFO","time":"2025-04-17T02:23:30.109Z","logger":"controller","message":"registered nodeclaim","commit":"62a726c","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"sysprep-r6fgf"},"namespace":"","name":"sysprep-r6fgf","reconcileID":"8f45ee82-92e7-47b2-92e7-43449a500f45","provider-id":"aws:///us-west-2a/i-0dd7235d205d3547d","Node":{"name":"ip-10-0-43-213.us-west-2.compute.internal"}}
{"level":"INFO","time":"2025-04-17T02:23:41.271Z","logger":"controller","message":"initialized nodeclaim","commit":"62a726c","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"sysprep-r6fgf"},"namespace":"","name":"sysprep-r6fgf","reconcileID":"5a63fb88-bcb2-4e50-87b2-30ffbd131697","provider-id":"aws:///us-west-2a/i-0dd7235d205d3547d","Node":{"name":"ip-10-0-43-213.us-west-2.compute.internal"},"allocatable":{"cpu":"1930m","ephemeral-storage":"95551679124","hugepages-1Gi":"0","hugepages-2Mi":"0","memory":"3232656Ki","pods":"29"}}
{"level":"INFO","time":"2025-04-17T02:33:56.588Z","logger":"controller","message":"tainted node","commit":"62a726c","controller":"node.termination","controllerGroup":"","controllerKind":"Node","Node":{"name":"ip-10-0-43-213.us-west-2.compute.internal"},"namespace":"","name":"ip-10-0-43-213.us-west-2.compute.internal","reconcileID":"e8b2aa8b-bb75-4566-9aa1-8c787ae1b7bf","taint.Key":"karpenter.sh/disrupted","taint.Value":"","taint.Effect":"NoSchedule"}
{"level":"INFO","time":"2025-04-17T02:34:23.083Z","logger":"controller","message":"deleted node","commit":"62a726c","controller":"node.termination","controllerGroup":"","controllerKind":"Node","Node":{"name":"ip-10-0-43-213.us-west-2.compute.internal"},"namespace":"","name":"ip-10-0-43-213.us-west-2.compute.internal","reconcileID":"90b9486f-add1-444b-872c-1fc3d748f6a8"}
{"level":"INFO","time":"2025-04-17T02:34:23.320Z","logger":"controller","message":"deleted nodeclaim","commit":"62a726c","controller":"nodeclaim.termination","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"sysprep-r6fgf"},"namespace":"","name":"sysprep-r6fgf","reconcileID":"0260c674-f67d-4d57-84e0-b1c9db150229","Node":{"name":"ip-10-0-43-213.us-west-2.compute.internal"},"provider-id":"aws:///us-west-2a/i-0dd7235d205d3547d"}
{"level":"ERROR","time":"2025-04-17T14:35:52.758Z","logger":"controller","message":"Failed to update lock optimitically: etcdserver: request timed out, falling back to slow path","commit":"62a726c"}
{"level":"INFO","time":"2025-04-18T02:23:23.974Z","logger":"controller","message":"discovered ssm parameter","commit":"62a726c","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"fc30a0be-2ef1-47bd-a48b-b6d491061241","parameter":"/aws/service/eks/optimized-ami/1.30/amazon-linux-2/amazon-eks-node-1.30-v20240917/image_id","value":"ami-05f7e80c30f28d8b9"}
{"level":"INFO","time":"2025-04-18T02:23:24.000Z","logger":"controller","message":"discovered ssm parameter","commit":"62a726c","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"fc30a0be-2ef1-47bd-a48b-b6d491061241","parameter":"/aws/service/eks/optimized-ami/1.30/amazon-linux-2-arm64/amazon-eks-arm64-node-1.30-v20240917/image_id","value":"ami-0b402b9a4c1bacaa5"}
{"level":"INFO","time":"2025-04-18T02:23:24.016Z","logger":"controller","message":"discovered ssm parameter","commit":"62a726c","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"fc30a0be-2ef1-47bd-a48b-b6d491061241","parameter":"/aws/service/eks/optimized-ami/1.30/amazon-linux-2-gpu/amazon-eks-gpu-node-1.30-v20240917/image_id","value":"ami-0356f40aea17e9b9e"}
Press Control + C to stop streaming the logs.
^C
๐๏ธ Configure storage - Host model data on Amazon FSx for Lustre
Amazon FSx for Lustre
- https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html
- Fully managed, high-performance file system: well suited to speed-sensitive workloads such as machine learning, analytics, and HPC
- Sub-millisecond latencies with performance at scale: scales to TB/s of throughput and millions of IOPS
- Amazon S3 integration: presents S3 objects as files and automatically syncs changes
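The S3 link can be inspected from the CLI as well; a sketch (my addition — the exact fields returned depend on how the lab file system was created):

# Inspect how the Lustre file system is linked to S3 (import/export paths or data repository config)
aws fsx describe-file-systems \
  --query 'FileSystems[].{Id:FileSystemId,Type:LustreConfiguration.DeploymentType,S3:LustreConfiguration.DataRepositoryConfiguration}'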
Deploying the CSI Driver
1. Set the account-id environment variable
WSParticipantRole:~/environment $ ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
2. Create an IAM service account
Create an IAM policy that grants the required AWS API permissions, then attach it to a service account.
WSParticipantRole:~/environment $ cat << EOF > fsx-csi-driver.json
> {
> "Version":"2012-10-17",
> "Statement":[
> {
> "Effect":"Allow",
> "Action":[
> "iam:CreateServiceLinkedRole",
> "iam:AttachRolePolicy",
> "iam:PutRolePolicy"
> ],
> "Resource":"arn:aws:iam::*:role/aws-service-role/s3.data-source.lustre.fsx.amazonaws.com/*"
> },
> {
> "Action":"iam:CreateServiceLinkedRole",
> "Effect":"Allow",
> "Resource":"*",
> "Condition":{
> "StringLike":{
> "iam:AWSServiceName":[
> "fsx.amazonaws.com"
> ]
> }
> }
> },
> {
> "Effect":"Allow",
> "Action":[
> "s3:ListBucket",
> "fsx:CreateFileSystem",
> "fsx:DeleteFileSystem",
> "fsx:DescribeFileSystems",
> "fsx:TagResource"
> ],
> "Resource":[
> "*"
> ]
> }
> ]
> }
> EOF
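Since the heredoc above is easy to mistype, it may be worth validating the JSON before creating the policy; jq is already available in this environment (it is used later in the walkthrough):

# Pretty-print the policy; jq exits non-zero if the JSON is malformed
jq . fsx-csi-driver.json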
3. Create the IAM policy
WSParticipantRole:~/environment $ aws iam create-policy \
> --policy-name Amazon_FSx_Lustre_CSI_Driver \
> --policy-document file://fsx-csi-driver.json
✅ Output
{
"Policy": {
"PolicyName": "Amazon_FSx_Lustre_CSI_Driver",
"PolicyId": "ANPATNWURYOGXHAZW6A6U",
"Arn": "arn:aws:iam::xxxxxxxxxxxxx:policy/Amazon_FSx_Lustre_CSI_Driver",
"Path": "/",
"DefaultVersionId": "v1",
"AttachmentCount": 0,
"PermissionsBoundaryUsageCount": 0,
"IsAttachable": true,
"CreateDate": "2025-04-19T02:18:02+00:00",
"UpdateDate": "2025-04-19T02:18:02+00:00"
}
}
4. Create the service account and attach the policy
Run the command below to create the service account for the driver and attach the IAM policy from step 3.
WSParticipantRole:~/environment $ eksctl create iamserviceaccount \
> --region $AWS_REGION \
> --name fsx-csi-controller-sa \
> --namespace kube-system \
> --cluster $CLUSTER_NAME \
> --attach-policy-arn arn:aws:iam::$ACCOUNT_ID:policy/Amazon_FSx_Lustre_CSI_Driver \
> --approve
✅ Output
2025-04-19 02:19:20 [โน] 1 iamserviceaccount (kube-system/fsx-csi-controller-sa) was included (based on the include/exclude rules)
2025-04-19 02:19:20 [!] serviceaccounts that exist in Kubernetes will be excluded, use --override-existing-serviceaccounts to override
2025-04-19 02:19:20 [โน] 1 task: {
2 sequential sub-tasks: {
create IAM role for serviceaccount "kube-system/fsx-csi-controller-sa",
create serviceaccount "kube-system/fsx-csi-controller-sa",
} }2025-04-19 02:19:20 [โน] building iamserviceaccount stack "eksctl-eksworkshop-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa"
2025-04-19 02:19:20 [โน] deploying stack "eksctl-eksworkshop-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa"
2025-04-19 02:19:20 [โน] waiting for CloudFormation stack "eksctl-eksworkshop-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa"
2025-04-19 02:19:50 [โน] waiting for CloudFormation stack "eksctl-eksworkshop-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa"
2025-04-19 02:19:50 [โน] created serviceaccount "kube-system/fsx-csi-controller-sa"
5. Store the role ARN
WSParticipantRole:~/environment $ export ROLE_ARN=$(aws cloudformation describe-stacks --stack-name "eksctl-${CLUSTER_NAME}-addon-iamserviceaccount-kube-system-fsx-csi-controller-sa" --query "Stacks[0].Outputs[0].OutputValue" --region $AWS_REGION --output text)
WSParticipantRole:~/environment $ echo $ROLE_ARN
✅ Output
arn:aws:iam::xxxxxxxxxxxxx:role/eksctl-eksworkshop-addon-iamserviceaccount-ku-Role1-eALfDkUyCxWi
6. Deploy the FSx for Lustre CSI driver
WSParticipantRole:~/environment $ kubectl apply -k "github.com/kubernetes-sigs/aws-fsx-csi-driver/deploy/kubernetes/overlays/stable/?ref=release-1.2"
✅ Output
# Warning: 'bases' is deprecated. Please use 'resources' instead. Run 'kustomize edit fix' to update your Kustomization automatically.
Warning: resource serviceaccounts/fsx-csi-controller-sa is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
serviceaccount/fsx-csi-controller-sa configured
serviceaccount/fsx-csi-node-sa created
clusterrole.rbac.authorization.k8s.io/fsx-csi-external-provisioner-role created
clusterrole.rbac.authorization.k8s.io/fsx-csi-node-role created
clusterrole.rbac.authorization.k8s.io/fsx-external-resizer-role created
clusterrolebinding.rbac.authorization.k8s.io/fsx-csi-external-provisioner-binding created
clusterrolebinding.rbac.authorization.k8s.io/fsx-csi-node-getter-binding created
clusterrolebinding.rbac.authorization.k8s.io/fsx-csi-resizer-binding created
deployment.apps/fsx-csi-controller created
daemonset.apps/fsx-csi-node created
csidriver.storage.k8s.io/fsx.csi.aws.com created
Verify the CSI driver installation
WSParticipantRole:~/environment $ kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-fsx-csi-driver
✅ Output
NAME READY STATUS RESTARTS AGE
fsx-csi-controller-6f4c577bd4-4wxrs 4/4 Running 0 38s
fsx-csi-controller-6f4c577bd4-r6kgd 4/4 Running 0 38s
fsx-csi-node-c4spk 3/3 Running 0 38s
fsx-csi-node-k9dd6 3/3 Running 0 38s
7. Annotate the service account
WSParticipantRole:~/environment $ kubectl annotate serviceaccount -n kube-system fsx-csi-controller-sa \
> eks.amazonaws.com/role-arn=$ROLE_ARN --overwrite=true
# Result
serviceaccount/fsx-csi-controller-sa annotated
View the service account details
WSParticipantRole:~/environment $ kubectl get sa/fsx-csi-controller-sa -n kube-system -o yaml
✅ Output
apiVersion: v1
kind: ServiceAccount
metadata:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::xxxxxxxxxxxxx:role/eksctl-eksworkshop-addon-iamserviceaccount-ku-Role1-eALfDkUyCxWi
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app.kubernetes.io/name":"aws-fsx-csi-driver"},"name":"fsx-csi-controller-sa","namespace":"kube-system"}}
creationTimestamp: "2025-04-19T02:19:50Z"
labels:
app.kubernetes.io/managed-by: eksctl
app.kubernetes.io/name: aws-fsx-csi-driver
name: fsx-csi-controller-sa
namespace: kube-system
resourceVersion: "873913"
uid: 7abe6847-4225-4740-9835-acd4ef985cd6
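One caveat worth noting (my addition, not an explicit workshop step): IRSA credentials are injected when a pod starts, so if the controller pods came up before the annotation was applied, restarting them ensures they assume the new role:

# Recreate the controller pods so they pick up the eks.amazonaws.com/role-arn annotation
kubectl -n kube-system rollout restart deployment fsx-csi-controller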
๐ฆ Create Persistent Volume on EKS Cluster
- Static provisioning: an administrator creates the FSx instance and the PV definition ahead of time, and PVC requests consume them
- Dynamic provisioning: when a PVC is requested, the CSI driver automatically creates both the PV and the FSx instance
- This section uses the static provisioning approach
1. Move to the working directory
WSParticipantRole:~/environment $ cd /home/ec2-user/environment/eks/FSxL
WSParticipantRole:~/environment/eks/FSxL $
2. Set variables for the FSx for Lustre instance
WSParticipantRole:~/environment/eks/FSxL $ FSXL_VOLUME_ID=$(aws fsx describe-file-systems --query 'FileSystems[].FileSystemId' --output text)
WSParticipantRole:~/environment/eks/FSxL $ DNS_NAME=$(aws fsx describe-file-systems --query 'FileSystems[].DNSName' --output text)
WSParticipantRole:~/environment/eks/FSxL $ MOUNT_NAME=$(aws fsx describe-file-systems --query 'FileSystems[].LustreConfiguration.MountName' --output text)
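Before templating the manifest, it does no harm to confirm the three values resolved (my addition, just a sanity check):

# All three should print non-empty values
echo "FSXL_VOLUME_ID=${FSXL_VOLUME_ID}"
echo "DNS_NAME=${DNS_NAME}"
echo "MOUNT_NAME=${MOUNT_NAME}"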
3. Review the PersistentVolume file
# fsxL-persistent-volume.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
name: fsx-pv
spec:
persistentVolumeReclaimPolicy: Retain
capacity:
storage: 1200Gi
volumeMode: Filesystem
accessModes:
- ReadWriteMany
mountOptions:
- flock
csi:
driver: fsx.csi.aws.com
volumeHandle: FSXL_VOLUME_ID
volumeAttributes:
dnsname: DNS_NAME
mountname: MOUNT_NAME
Replace FSXL_VOLUME_ID, DNS_NAME, and MOUNT_NAME with the actual values from the FSx for Lustre instance.
WSParticipantRole:~/environment/eks/FSxL $ sed -i'' -e "s/FSXL_VOLUME_ID/$FSXL_VOLUME_ID/g" fsxL-persistent-volume.yaml
WSParticipantRole:~/environment/eks/FSxL $ sed -i'' -e "s/DNS_NAME/$DNS_NAME/g" fsxL-persistent-volume.yaml
WSParticipantRole:~/environment/eks/FSxL $ sed -i'' -e "s/MOUNT_NAME/$MOUNT_NAME/g" fsxL-persistent-volume.yaml
WSParticipantRole:~/environment/eks/FSxL $ cat fsxL-persistent-volume.yaml
✅ Output
apiVersion: v1
kind: PersistentVolume
metadata:
name: fsx-pv
spec:
persistentVolumeReclaimPolicy: Retain
capacity:
storage: 1200Gi
volumeMode: Filesystem
accessModes:
- ReadWriteMany
mountOptions:
- flock
csi:
driver: fsx.csi.aws.com
volumeHandle: fs-032691accd783ccb1
volumeAttributes:
dnsname: fs-032691accd783ccb1.fsx.us-west-2.amazonaws.com
mountname: u42dbb4v
4. Deploy the PersistentVolume (PV)
WSParticipantRole:~/environment/eks/FSxL $ kubectl apply -f fsxL-persistent-volume.yaml
# Result
persistentvolume/fsx-pv created
Verify the PersistentVolume was created
WSParticipantRole:~/environment/eks/FSxL $ kubectl get pv
✅ Output
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS VOLUMEATTRIBUTESCLASS REASON AGE
fsx-pv 1200Gi RWX Retain Available <unset> 24s
pvc-47fa9ff1-7e9f-435d-9bfb-513f0df382e9 50Gi RWO Delete Bound kube-prometheus-stack/data-prometheus-kube-prometheus-stack-prometheus-0 gp3 <unset> 2d
5. Review the PersistentVolumeClaim file
# fsxL-claim.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: fsx-lustre-claim
spec:
accessModes:
- ReadWriteMany
storageClassName: ""
resources:
requests:
storage: 1200Gi
volumeName: fsx-pv
6. Deploy the PersistentVolumeClaim
WSParticipantRole:~/environment/eks/FSxL $ kubectl apply -f fsxL-claim.yaml
# Result
persistentvolumeclaim/fsx-lustre-claim created
7. Verify PV/PVC binding
WSParticipantRole:~/environment/eks/FSxL $ kubectl get pv,pvc
✅ Output
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS VOLUMEATTRIBUTESCLASS REASON AGE
persistentvolume/fsx-pv 1200Gi RWX Retain Bound default/fsx-lustre-claim <unset> 2m29s
persistentvolume/pvc-47fa9ff1-7e9f-435d-9bfb-513f0df382e9 50Gi RWO Delete Bound kube-prometheus-stack/data-prometheus-kube-prometheus-stack-prometheus-0 gp3 <unset> 2d
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS VOLUMEATTRIBUTESCLASS AGE
persistentvolumeclaim/fsx-lustre-claim Bound fsx-pv 1200Gi RWX <unset> 28s
📌 Review options and performance details in the Amazon FSx console
1. Go to the FSx console and choose Create file system
2. Select Amazon FSx for Lustre, then click Next
3. Review the FSx for Lustre deployment option details
4. Click Cancel to close the setup screen and return to the FSx list
A 1200 GiB FSx for Lustre instance has already been provisioned
5. Check the Monitoring & performance tab
Review the capacity, throughput, and IOPS summary metrics, along with metadata performance and network metrics
🤖 Deploy the Generative AI Chat Application
1. Create a Karpenter NodePool and EC2NodeClass for AWS Inferentia accelerators
(1) In the Cloud9 terminal, move to the working directory
WSParticipantRole:~/environment/eks/FSxL $ cd /home/ec2-user/environment/eks/genai
WSParticipantRole:~/environment/eks/genai $
(2) Review the NodePool definition
WSParticipantRole:~/environment/eks/genai $ cat inferentia_nodepool.yaml
✅ Output
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
name: inferentia
labels:
intent: genai-apps
NodeGroupType: inf2-neuron-karpenter
spec:
template:
spec:
taints:
- key: aws.amazon.com/neuron
value: "true"
effect: "NoSchedule"
requirements:
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["inf2"]
- key: "karpenter.k8s.aws/instance-size"
operator: In
values: [ "xlarge", "2xlarge", "8xlarge", "24xlarge", "48xlarge"]
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
nodeClassRef:
group: karpenter.k8s.aws
kind: EC2NodeClass
name: inferentia
limits:
cpu: 1000
memory: 1000Gi
disruption:
consolidationPolicy: WhenEmpty
# expireAfter: 720h # 30 * 24h = 720h
consolidateAfter: 180s
weight: 100
---
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
name: inferentia
spec:
amiFamily: AL2
amiSelectorTerms:
- alias: al2@v20240917
blockDeviceMappings:
- deviceName: /dev/xvda
ebs:
deleteOnTermination: true
volumeSize: 100Gi
volumeType: gp3
role: "Karpenter-eksworkshop"
subnetSelectorTerms:
- tags:
karpenter.sh/discovery: "eksworkshop"
securityGroupSelectorTerms:
- tags:
karpenter.sh/discovery: "eksworkshop"
tags:
intent: apps
managed-by: karpenter
(3) Deploy the NodePool and EC2NodeClass
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f inferentia_nodepool.yaml
# Result
nodepool.karpenter.sh/inferentia created
ec2nodeclass.karpenter.k8s.aws/inferentia created
(4) Check the status of the created resources
WSParticipantRole:~/environment/eks/genai $ kubectl get nodepool,ec2nodeclass inferentia
✅ Output
NAME NODECLASS NODES READY AGE
nodepool.karpenter.sh/inferentia inferentia 0 True 35s
NAME READY AGE
ec2nodeclass.karpenter.k8s.aws/inferentia True 35s
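Nothing has been provisioned yet (NODES is 0); once a workload that tolerates the Neuron taint is scheduled, Karpenter creates a NodeClaim for this pool. A way to watch that happen (my addition):

# Watch Karpenter create and register NodeClaims for the inferentia pool
kubectl get nodeclaims -w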
2. Install the Neuron device plugin and scheduler
(1) Install the Neuron device plugin
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-device-plugin-rbac.yml
# Result
Warning: resource clusterroles/neuron-device-plugin is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
clusterrole.rbac.authorization.k8s.io/neuron-device-plugin configured
Warning: resource serviceaccounts/neuron-device-plugin is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
serviceaccount/neuron-device-plugin configured
Warning: resource clusterrolebindings/neuron-device-plugin is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
clusterrolebinding.rbac.authorization.k8s.io/neuron-device-plugin configured
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-device-plugin.yml
# Result
daemonset.apps/neuron-device-plugin-daemonset created
(2) Install the Neuron scheduler
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-scheduler-eks.yml
# Result
clusterrole.rbac.authorization.k8s.io/k8s-neuron-scheduler created
serviceaccount/k8s-neuron-scheduler created
clusterrolebinding.rbac.authorization.k8s.io/k8s-neuron-scheduler created
Warning: spec.template.metadata.annotations[scheduler.alpha.kubernetes.io/critical-pod]: non-functional in v1.16+; use the "priorityClassName" field instead
deployment.apps/k8s-neuron-scheduler created
service/k8s-neuron-scheduler created
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/my-scheduler.yml
# Result
serviceaccount/my-scheduler created
clusterrolebinding.rbac.authorization.k8s.io/my-scheduler-as-kube-scheduler created
clusterrolebinding.rbac.authorization.k8s.io/my-scheduler-as-volume-scheduler created
clusterrole.rbac.authorization.k8s.io/my-scheduler created
clusterrolebinding.rbac.authorization.k8s.io/my-scheduler created
configmap/my-scheduler-config created
deployment.apps/my-scheduler created
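After an Inferentia node joins later on, the device plugin advertises aws.amazon.com/neuron as an allocatable resource on that node; a quick check (my addition):

# Show per-node allocatable Neuron devices (empty until an inf2 node is running)
kubectl get nodes \
  -o custom-columns='NAME:.metadata.name,NEURON:.status.allocatable.aws\.amazon\.com/neuron'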
3. Deploy the vLLM application pod
(1) Deploy the vLLM pod and service - this takes roughly 7-8 minutes
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f mistral-fsxl.yaml
# Result
deployment.apps/vllm-mistral-inf2-deployment created
service/vllm-mistral7b-service created
(2) Review the mistral-fsxl.yaml file
WSParticipantRole:~/environment/eks/genai $ cat mistral-fsxl.yaml
✅ Output
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-mistral-inf2-deployment
spec:
replicas: 1
selector:
matchLabels:
app: vllm-mistral-inf2-server
template:
metadata:
labels:
app: vllm-mistral-inf2-server
spec:
tolerations:
- key: "aws.amazon.com/neuron"
operator: "Exists"
effect: "NoSchedule"
containers:
- name: inference-server
image: public.ecr.aws/u3r1l1j7/eks-genai:neuronrayvllm-100G-root
resources:
requests:
aws.amazon.com/neuron: 1
limits:
aws.amazon.com/neuron: 1
args:
- --model=$(MODEL_ID)
- --enforce-eager
- --gpu-memory-utilization=0.96
- --device=neuron
- --max-num-seqs=4
- --tensor-parallel-size=2
- --max-model-len=10240
- --served-model-name=mistralai/Mistral-7B-Instruct-v0.2-neuron
env:
- name: MODEL_ID
value: /work-dir/Mistral-7B-Instruct-v0.2/
- name: NEURON_COMPILE_CACHE_URL
value: /work-dir/Mistral-7B-Instruct-v0.2/neuron-cache/
- name: PORT
value: "8000"
volumeMounts:
- name: persistent-storage
mountPath: "/work-dir"
volumes:
- name: persistent-storage
persistentVolumeClaim:
claimName: fsx-lustre-claim
---
apiVersion: v1
kind: Service
metadata:
name: vllm-mistral7b-service
spec:
selector:
app: vllm-mistral-inf2-server
ports:
- protocol: TCP
port: 80
targetPort: 8000
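The Service only exposes the model inside the cluster. To poke it directly before the WebUI is installed, a minimal sketch (my addition, assuming vLLM's usual OpenAI-compatible API on the container's port 8000 and a pod in the Running state):

# Forward a local port to the service, then list the served model
kubectl port-forward svc/vllm-mistral7b-service 8000:80 &
curl -s http://localhost:8000/v1/models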
(3) Monitor the deployment status
WSParticipantRole:~/environment/eks/genai $ kubectl get pod
✅ Output
NAME READY STATUS RESTARTS AGE
kube-ops-view-5d9d967b77-w75cm 1/1 Running 0 2d
vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb 0/1 ContainerCreating 0 87s
4. Verify the Inferentia node in the EKS console
(1) Select the eksworkshop cluster
(2) On the Compute tab, confirm the inf2.xlarge node
(3) Click the node name, then review the capacity allocation and Pod details
WSParticipantRole:~/environment/eks/genai $ kubectl get pod
NAME READY STATUS RESTARTS AGE
kube-ops-view-5d9d967b77-w75cm 1/1 Running 0 2d
vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb 1/1 Running 0 8m2s
💬 Deploy the WebUI chat application to interact with the model
1. Deploy the WebUI application
WSParticipantRole:~/environment/eks/genai $ kubectl apply -f open-webui.yaml
# Result
deployment.apps/open-webui-deployment created
service/open-webui-service created
ingress.networking.k8s.io/open-webui-ingress created
2. Get the WebUI interface URL
WSParticipantRole:~/environment/eks/genai $ kubectl get ing
✅ Output
NAME CLASS HOSTS ADDRESS PORTS AGE
open-webui-ingress alb * open-webui-ingress-628580148.us-west-2.elb.amazonaws.com 80 32s
3. Access the WebUI
open-webui-ingress-628580148.us-west-2.elb.amazonaws.com
4. Select the model and start chatting
Verify the Mistral-7B data, then share and restore the generated data assets
🌎 Configure cross-Region replication for the FSx for Lustre instance linked to S3
1. Open the Amazon S3 console
2. Select the S3 bucket created in this Region (not the 2ndregion S3 bucket)
3. Create a replication rule
Select Management → Create replication rules
4. Choose Enable Bucket Versioning
5. Configure the source bucket
Select Limit the scope of this rule using one or more filters, then enter test/ as the filter value
6. Destination
Browse S3 → fsx-lustre-bucket-2ndregion-xxxx
→ Choose path
7. Specify the IAM role
Select s3-cross-region-replication-role*
8. Encryption
9. Save
Replicate existing objects? → No, do not replicate existing objects → Submit
10. Confirm the replication rule was created
🧪 Inspect the Mistral-7B data and create a test file
1. Move to the working directory
WSParticipantRole:~/environment/eks/genai $ cd /home/ec2-user/environment/eks/FSxL
WSParticipantRole:~/environment/eks/FSxL $
2. List the pods
WSParticipantRole:~/environment/eks/FSxL $ kubectl get pods
NAME READY STATUS RESTARTS AGE
kube-ops-view-5d9d967b77-w75cm 1/1 Running 0 2d1h
open-webui-deployment-5d7ff94bc9-wfv9v 1/1 Running 0 26m
vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb 1/1 Running 0 35m
Copy the name of the pod that starts with vllm-
vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb
3. Open a shell in the vLLM pod
WSParticipantRole:~/environment/eks/FSxL $ kubectl exec -it vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb -- bash
bash: /home/ray/anaconda3/lib/libtinfo.so.6: no version information available (required by bash)
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:~#
4. Check where the PV is mounted
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:~# df -h
✅ Output
Filesystem Size Used Avail Use% Mounted on
overlay 100G 23G 78G 23% /
tmpfs 64M 0 64M 0% /dev
tmpfs 7.7G 0 7.7G 0% /sys/fs/cgroup
10.0.34.188@tcp:/u42dbb4v 1.2T 28G 1.1T 3% /work-dir
/dev/nvme0n1p1 100G 23G 78G 23% /etc/hosts
shm 64M 0 64M 0% /dev/shm
tmpfs 15G 12K 15G 1% /run/secrets/kubernetes.io/serviceaccount
tmpfs 7.7G 0 7.7G 0% /proc/acpi
tmpfs 7.7G 0 7.7G 0% /sys/firmware
/work-dir is the PV backed by FSx for Lustre
5. Inspect the contents stored on the PV
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:~# cd /work-dir/
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir# ls -ll
✅ Output
total 297
drwxr-xr-x 5 root root 33280 Apr 16 19:30 Mistral-7B-Instruct-v0.2
-rw-r--r-- 1 root root 151289 Apr 16 19:32 sysprep
Inspect the Mistral-7B model folder
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir# cd Mistral-7B-Instruct-v0.2/
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir/Mistral-7B-Instruct-v0.2# ls -ll
✅ Output
total 2550
drwxr-xr-x 3 root root 33280 Apr 16 19:30 -split
-rwxr-xr-x 1 root root 5471 Apr 16 19:05 README.md
-rwxr-xr-x 1 root root 596 Apr 16 19:05 config.json
-rwxr-xr-x 1 root root 111 Apr 16 19:05 generation_config.json
-rwxr-xr-x 1 root root 25125 Apr 16 19:05 model.safetensors.index.json
drwxr-xr-x 3 root root 33280 Apr 16 19:30 neuron-cache
-rwxr-xr-x 1 root root 23950 Apr 16 19:05 pytorch_model.bin.index.json
-rwxr-xr-x 1 root root 414 Apr 16 19:05 special_tokens_map.json
-rwxr-xr-x 1 root root 1795188 Apr 16 19:05 tokenizer.json
-rwxr-xr-x 1 root root 493443 Apr 16 19:05 tokenizer.model
-rwxr-xr-x 1 root root 2103 Apr 16 19:05 tokenizer_config.json
6. Create a test folder and file
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir/Mistral-7B-Instruct-v0.2# cd /work-dir
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir# mkdir test
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir# cd test
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir/test# cp /work-dir/Mistral-7B-Instruct-v0.2/README.md /work-dir/test/testfile
(base) root@vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb:/work-dir/test# ls -ll /work-dir/test
✅ Output
total 1
-rwxr-xr-x 1 root root 5471 Apr 18 20:33 testfile
9. Verify the S3 bucket replication results
(1) Check the data in the source bucket
(2) Check the replicated data in the destination (2ndregion) bucket
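The same check can be scripted from the Cloud9 terminal; a sketch (my addition, with SRC_BUCKET, DEST_BUCKET, and the second Region as hypothetical placeholders for the workshop bucket names):

# List the replicated object under the test/ prefix in both buckets
aws s3 ls s3://SRC_BUCKET/test/
aws s3 ls s3://DEST_BUCKET/test/ --region SECOND_REGION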
โ๏ธย ๋ฐ์ดํฐ ๊ณ์ธต ํ ์คํธ๋ฅผ ์ํ ์์ฒด ํ๊ฒฝ ์์ฑ
1. ํ๊ฒฝ ๋ณ์ ์ค์
1
2
3
WSParticipantRole:~/environment/eks/FSxL $ VPC_ID=$(aws eks describe-cluster --name $CLUSTER_NAME --region $AWS_REGION --query "cluster.resourcesVpcConfig.vpcId" --output text)
WSParticipantRole:~/environment/eks/FSxL $ SUBNET_ID=$(aws eks describe-cluster --name $CLUSTER_NAME --region $AWS_REGION --query "cluster.resourcesVpcConfig.subnetIds[0]" --output text)
WSParticipantRole:~/environment/eks/FSxL $ SECURITY_GROUP_ID=$(aws ec2 describe-security-groups --filters Name=vpc-id,Values=${VPC_ID} Name=group-name,Values="FSxLSecurityGroup01" --query "SecurityGroups[*].GroupId" --output text)
WSParticipantRole:~/environment/eks/FSxL $ echo $SUBNET_ID
subnet-047814202808347bc
WSParticipantRole:~/environment/eks/FSxL $ echo $SECURITY_GROUP_ID
sg-00e9611cf170ffb25
2. Move to the working directory
WSParticipantRole:~/environment/eks/FSxL $ cd /home/ec2-user/environment/eks/FSxL
WSParticipantRole:~/environment/eks/FSxL $
3. Create a StorageClass
(1) Review fsxL-storage-class.yaml
# fsxL-storage-class.yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: fsx-lustre-sc
provisioner: fsx.csi.aws.com
parameters:
subnetId: SUBNET_ID
securityGroupIds: SECURITY_GROUP_ID
deploymentType: SCRATCH_2
fileSystemTypeVersion: "2.15"
mountOptions:
- flock
(2) Substitute SUBNET_ID and SECURITY_GROUP_ID
WSParticipantRole:~/environment/eks/FSxL $ sed -i'' -e "s/SUBNET_ID/$SUBNET_ID/g" fsxL-storage-class.yaml
WSParticipantRole:~/environment/eks/FSxL $ sed -i'' -e "s/SECURITY_GROUP_ID/$SECURITY_GROUP_ID/g" fsxL-storage-class.yaml
(3) Re-check fsxL-storage-class.yaml
WSParticipantRole:~/environment/eks/FSxL $ cat fsxL-storage-class.yaml
✅ Output
---
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: fsx-lustre-sc
provisioner: fsx.csi.aws.com
parameters:
subnetId: subnet-047814202808347bc
securityGroupIds: sg-00e9611cf170ffb25
deploymentType: SCRATCH_2
fileSystemTypeVersion: "2.15"
mountOptions:
- flock
(4) Deploy the StorageClass
WSParticipantRole:~/environment/eks/FSxL $ kubectl apply -f fsxL-storage-class.yaml
# Result
storageclass.storage.k8s.io/fsx-lustre-sc created
(5) Verify the StorageClass was created
WSParticipantRole:~/environment/eks/FSxL $ kubectl get sc
✅ Output
NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
fsx-lustre-sc fsx.csi.aws.com Delete Immediate false 28s
gp2 kubernetes.io/aws-ebs Delete WaitForFirstConsumer false 2d1h
gp3 (default) ebs.csi.aws.com Delete WaitForFirstConsumer true 2d1h
5. Create a PVC
(1) Review fsxL-dynamic-claim.yaml
WSParticipantRole:~/environment/eks/FSxL $ cat fsxL-dynamic-claim.yaml
✅ Output
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: fsx-lustre-dynamic-claim
spec:
accessModes:
- ReadWriteMany
storageClassName: fsx-lustre-sc
resources:
requests:
storage: 1200Gi
(2) Create the PersistentVolumeClaim (PVC)
WSParticipantRole:~/environment/eks/FSxL $ kubectl apply -f fsxL-dynamic-claim.yaml
# Result
persistentvolumeclaim/fsx-lustre-dynamic-claim created
(3) Check the PVC status
WSParticipantRole:~/environment/eks/FSxL $ kubectl describe pvc/fsx-lustre-dynamic-claim
✅ Output
Name: fsx-lustre-dynamic-claim
Namespace: default
StorageClass: fsx-lustre-sc
Status: Pending
Volume:
Labels: <none>
Annotations: volume.beta.kubernetes.io/storage-provisioner: fsx.csi.aws.com
volume.kubernetes.io/storage-provisioner: fsx.csi.aws.com
Finalizers: [kubernetes.io/pvc-protection]
Capacity:
Access Modes:
VolumeMode: Filesystem
Used By: <none>
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Provisioning 28s fsx.csi.aws.com_fsx-csi-controller-6f4c577bd4-4wxrs_3244bfde-da2e-414b-913b-9a55b306d020 External provisioner is provisioning volume for claim "default/fsx-lustre-dynamic-claim"
Normal ExternalProvisioning 11s (x4 over 28s) persistentvolume-controller Waiting for a volume to be created either by the external provisioner 'fsx.csi.aws.com' or manually by the system administrator. If volume creation is delayed, please verify that the provisioner is running and correctly registered.
Check that the claim is still waiting to bind
WSParticipantRole:~/environment/eks/FSxL $ kubectl get pvc
✅ Output
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS VOLUMEATTRIBUTESCLASS AGE
fsx-lustre-claim Bound fsx-pv 1200Gi RWX <unset> 73m
fsx-lustre-dynamic-claim Pending fsx-lustre-sc <unset> 77s
6. Confirm the PVC has bound
After about 15 minutes, confirm the PVC status is Bound
WSParticipantRole:~/environment/eks/FSxL $ kubectl get pvc
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS VOLUMEATTRIBUTESCLASS AGE
fsx-lustre-claim Bound fsx-pv 1200Gi RWX <unset> 96m
fsx-lustre-dynamic-claim Bound pvc-1c121fa2-7362-4c0b-a2bf-dfe4d0a24b11 1200Gi RWX fsx-lustre-sc <unset> 24m
Check the instance status in the FSx console
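The dynamically provisioned file system can also be confirmed from the CLI (my addition); there should now be a SCRATCH_2 file system alongside the pre-provisioned one:

# List all FSx file systems with their lifecycle state, deployment type, and size
aws fsx describe-file-systems \
  --query 'FileSystems[].{Id:FileSystemId,State:Lifecycle,Type:LustreConfiguration.DeploymentType,GiB:StorageCapacity}'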
📈 Performance Test
1. Move to the working directory
WSParticipantRole:~/environment/eks/FSxL $ cd /home/ec2-user/environment/eks/FSxL
WSParticipantRole:~/environment/eks/FSxL $
2. Check the Availability Zone
WSParticipantRole:~/environment/eks/FSxL $ aws ec2 describe-subnets --subnet-id $SUBNET_ID --region $AWS_REGION | jq .Subnets[0].AvailabilityZone
✅ Output
"us-west-2b"
3. Edit the pod configuration
In pod_performance.yaml, uncomment the nodeSelector with topology.kubernetes.io/zone and change the availability zone to us-west-2b
4. Create the pod
WSParticipantRole:~/environment/eks/FSxL $ kubectl apply -f pod_performance.yaml
# Result
pod/fsxl-performance created
5. Check the pod status
WSParticipantRole:~/environment/eks/FSxL $ kubectl get pods
✅ Output
NAME READY STATUS RESTARTS AGE
fsxl-performance 1/1 Running 0 12s
kube-ops-view-5d9d967b77-w75cm 1/1 Running 0 2d1h
open-webui-deployment-5d7ff94bc9-wfv9v 1/1 Running 0 75m
vllm-mistral-inf2-deployment-7d886c8cc8-n6sbb 1/1 Running 0 84m
6. Open a shell in the container
WSParticipantRole:~/environment/eks/FSxL $ kubectl exec -it fsxl-performance -- bash
root@fsxl-performance:/#
7. Install FIO and IOping
root@fsxl-performance:/# apt-get update
# Result
Get:1 http://deb.debian.org/debian bookworm InRelease [151 kB]
Get:2 http://deb.debian.org/debian bookworm-updates InRelease [55.4 kB]
Get:3 http://deb.debian.org/debian-security bookworm-security InRelease [48.0 kB]
Get:4 http://deb.debian.org/debian bookworm/main amd64 Packages [8792 kB]
Get:5 http://deb.debian.org/debian bookworm-updates/main amd64 Packages [512 B]
Get:6 http://deb.debian.org/debian-security bookworm-security/main amd64 Packages [255 kB]
Fetched 9303 kB in 1s (6521 kB/s)
Reading package lists... Done
root@fsxl-performance:/# apt-get install fio ioping -y
# Result
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
...
Processing triggers for libc-bin (2.36-9+deb12u10) ...
8. Latency test (IOping)
root@fsxl-performance:/# ioping -c 20 .
✅ Output
4 KiB <<< . (overlay overlay 19.9 GiB): request=1 time=459.7 us (warmup)
4 KiB <<< . (overlay overlay 19.9 GiB): request=2 time=1.07 ms
4 KiB <<< . (overlay overlay 19.9 GiB): request=3 time=693.1 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=4 time=720.3 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=5 time=634.4 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=6 time=661.2 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=7 time=581.8 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=8 time=742.6 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=9 time=647.6 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=10 time=730.4 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=11 time=809.5 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=12 time=713.6 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=13 time=604.9 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=14 time=1.34 ms
4 KiB <<< . (overlay overlay 19.9 GiB): request=15 time=700.8 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=16 time=591.3 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=17 time=698.9 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=18 time=928.0 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=19 time=638.9 us
4 KiB <<< . (overlay overlay 19.9 GiB): request=20 time=632.7 us
--- . (overlay overlay 19.9 GiB) ioping statistics ---
19 requests completed in 14.1 ms, 76 KiB read, 1.34 k iops, 5.25 MiB/s
generated 20 requests in 19.0 s, 80 KiB, 1 iops, 4.21 KiB/s
min/avg/max/mdev = 581.8 us / 744.1 us / 1.34 ms / 181.8 us
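Note that ioping above ran against the container's overlay root (see the "overlay overlay 19.9 GiB" target), not the Lustre mount. To measure the FSx volume itself, the same test can be pointed at the mount used in the next step (my addition, assuming /data is the FSx-backed path as the fio step below suggests):

# Measure request latency directly on the FSx for Lustre mount
ioping -c 20 /data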
9. Load test (FIO)
root@fsxl-performance:/# mkdir -p /data/performance
root@fsxl-performance:/# cd /data/performance
root@fsxl-performance:/data/performance#
root@fsxl-performance:/data/performance# fio --randrepeat=1 --ioengine=libaio --direct=1 --gtod_reduce=1 --name=fiotest --filename=testfio8gb --bs=1MB --iodepth=64 --size=8G --readwrite=randrw --rwmixread=50 --numjobs=8 --group_reporting --runtime=10
✅ Output
fiotest: (g=0): rw=randrw, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=libaio, iodepth=64
...
fio-3.33
Starting 8 processes
fiotest: Laying out IO file (1 file / 8192MiB)
Jobs: 6 (f=5): [f(1),_(1),m(3),_(1),m(2)][38.9%][r=89.0MiB/s,w=100MiB/s][r=89,w=100 IOPS][eta 00m:22s]
fiotest: (groupid=0, jobs=8): err= 0: pid=723: Sat Apr 19 04:24:09 2025
read: IOPS=96, BW=96.4MiB/s (101MB/s)(1394MiB/14460msec)
bw ( KiB/s): min=20461, max=390963, per=100.00%, avg=120798.95, stdev=11723.27, samples=156
iops : min= 17, max= 380, avg=116.50, stdev=11.43, samples=156
write: IOPS=97, BW=97.2MiB/s (102MB/s)(1406MiB/14460msec); 0 zone resets
bw ( KiB/s): min=18403, max=391048, per=100.00%, avg=116698.51, stdev=11301.16, samples=161
iops : min= 13, max= 380, avg=112.42, stdev=11.04, samples=161
cpu : usr=0.09%, sys=0.76%, ctx=5389, majf=0, minf=53
IO depths : 1=0.3%, 2=0.6%, 4=1.1%, 8=2.3%, 16=4.6%, 32=9.1%, >=64=82.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=99.7%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.3%, >=64=0.0%
issued rwts: total=1394,1406,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=64
Run status group 0 (all jobs):
READ: bw=96.4MiB/s (101MB/s), 96.4MiB/s-96.4MiB/s (101MB/s-101MB/s), io=1394MiB (1462MB), run=14460-14460msec
WRITE: bw=97.2MiB/s (102MB/s), 97.2MiB/s-97.2MiB/s (102MB/s-102MB/s), io=1406MiB (1474MB), run=14460-14460msec
10. Exit the test container
root@fsxl-performance:/data/performance# exit
exit