Distributed training is the ability to split the training of a model among multiple processors. It is often a necessity when multi-GPU training no longer applies, typically when you require more GPUs than exist on a single node. Each such split is a pod (see definition above). Run:ai spawns an additional launcher process that manages and coordinates the other worker pods. For more information, see Distributed training.
Use this endpoint to create a distributed training workload.
Request completed successfully.
Bad submission request.
Unauthorized
Forbidden
unexpected error
{- "name": "my-workload-name",
- "useGivenNameAsPrefix": true,
- "projectId": 1,
- "clusterId": "71f69d83-ba66-4822-adf5-55ce55efd210",
- "spec": {
- "command": "python",
- "args": "-x my-script.py",
- "image": "python:3.8",
- "imagePullPolicy": "Always",
- "workingDir": "/home/myfolder",
- "createHomeDir": true,
- "probes": {
- "readiness": {
- "initialDelaySeconds": 0,
- "periodSeconds": 1,
- "timeoutSeconds": 1,
- "successThreshold": 1,
- "failureThreshold": 1,
- "handler": {
- "httpGet": {
- "path": "/",
- "port": 1,
- "host": "example.com",
- "scheme": "HTTP"
}
}
}
}, - "nodeType": "my-node-type",
- "nodePools": [
- "my-node-pool-a",
- "my-node-pool-b"
], - "podAffinity": {
- "type": "Required",
- "key": "string"
}, - "tty": true,
- "stdin": true,
- "environmentVariables": [
- {
- "name": "HOME",
- "value": "/home/my-folder",
- "secret": {
- "name": "postgress_secret",
- "key": "POSTGRES_PASSWORD"
}, - "exclude": false,
- "description": "Home directory of the user."
}
], - "annotations": [
- {
- "name": "billing",
- "value": "my-billing-unit",
- "exclude": false
}
], - "labels": [
- {
- "name": "stage",
- "value": "initial-research",
- "exclude": false
}
], - "tolerations": [
- {
- "name": "string",
- "operator": "Equal",
- "key": "string",
- "value": "string",
- "effect": "NoSchedule",
- "seconds": 1,
- "exclude": false
}
], - "terminateAfterPreemption": false,
- "autoDeletionTimeAfterCompletionSeconds": 15,
- "terminationGracePeriodSeconds": 20,
- "backoffLimit": 3,
- "ports": [
- {
- "container": 8080,
- "serviceType": "LoadBalancer",
- "external": 30080,
- "toolType": "pytorch",
- "toolName": "my-pytorch",
- "name": "port-instance-a"
}
], - "exposedUrls": [
- {
- "container": 8080,
- "authorizedUsers": [
- "user-a",
- "user-b"
], - "authorizedGroups": [
- "group-a",
- "group-b"
], - "toolType": "jupyter",
- "toolName": "my-pytorch",
- "name": "url-instance-a"
}
], - "numWorkers": 1,
- "distributedFramework": "MPI",
- "slotsPerWorker": 1,
- "minReplicas": 0,
- "maxReplicas": 0,
- "compute": {
- "gpuDevicesRequest": 1,
- "gpuRequestType": "portion",
- "gpuPortionRequest": 0.5,
- "gpuPortionLimit": 0.5,
- "gpuMemoryRequest": "10M",
- "gpuMemoryLimit": "10M",
- "migProfile": "1g.5gb",
- "cpuCoreRequest": 0.5,
- "cpuCoreLimit": 2,
- "cpuMemoryRequest": "20M",
- "cpuMemoryLimit": "30M",
- "largeShmRequest": false,
- "extendedResources": [
- {
- "resource": "hardware-vendor.example/foo",
- "quantity": 2,
- "exclude": false
}
]
}, - "storage": {
- "dataVolume": [
- {
- "id": "123e4567-e89b-12d3-a456-426614174000",
- "mountPath": "/mnt/data"
}
], - "pvc": [
- {
- "name": "storage-instance-a",
- "path": "/container/my-claim",
- "existingPvc": false,
- "claimName": "my-claim",
- "readOnly": false,
- "ephemeral": false,
- "claimInfo": {
- "size": "1G",
- "storageClass": "my-storage-class",
- "accessModes": {
- "readWriteOnce": true,
- "readOnlyMany": false,
- "readWriteMany": false
}, - "volumeMode": "Filesystem"
}
}
], - "hostPath": [
- {
- "name": "storage-instance-a",
- "path": "/container/directory",
- "readOnly": true,
- "mountPath": "/local/directory",
- "mountPropagation": "None"
}
], - "nfs": [
- {
- "name": "storage-instance-a",
- "path": "/container/nfs",
- "readOnly": true,
- "server": "my.nfs.com",
- "mountPath": "/local/nfs"
}
], - "git": [
- {
- "name": "storage-instance-a",
- "branch": "main",
- "revision": "string",
- "path": "/container/my-repository",
- "passwordSecret": "my-password-secret",
- "secretKeyOfUser": "User",
- "secretKeyOfPassword": "Password"
}
], - "configMapVolume": [
- {
- "name": "storage-instance-a",
- "configMap": "string",
- "mountPath": "string"
}
], - "secretVolume": [
- {
- "name": "storage-instance-a",
- "mountPath": "string",
- "secret": "string"
}
], - "s3": [
- {
- "name": "storage-instance-a",
- "bucket": "my-bucket",
- "path": "/container/my-bucket",
- "accessKeySecret": "my-access-key-secret",
- "secretKeyOfAccessKeyId": "AccessKeyId",
- "secretKeyOfSecretKey": "SecretKey"
}
]
}, - "security": {
- "uidGidSource": "fromTheImage",
- "capabilities": [
- "CHOWN",
- "KILL"
], - "seccompProfileType": "RuntimeDefault",
- "runAsNonRoot": true,
- "readOnlyRootFilesystem": false,
- "runAsUid": 500,
- "runAsGid": 30,
- "supplementalGroups": "2,3,5,8",
- "allowPrivilegeEscalation": false,
- "hostIpc": false,
- "hostNetwork": false
}
}, - "masterSpecSameAsWorker": true,
- "masterSpec": {
- "command": "python",
- "args": "-x my-script.py",
- "image": "python:3.8",
- "imagePullPolicy": "Always",
- "workingDir": "/home/myfolder",
- "createHomeDir": true,
- "probes": {
- "readiness": {
- "initialDelaySeconds": 0,
- "periodSeconds": 1,
- "timeoutSeconds": 1,
- "successThreshold": 1,
- "failureThreshold": 1,
- "handler": {
- "httpGet": {
- "path": "/",
- "port": 1,
- "host": "example.com",
- "scheme": "HTTP"
}
}
}
}, - "nodeType": "my-node-type",
- "nodePools": [
- "my-node-pool-a",
- "my-node-pool-b"
], - "podAffinity": {
- "type": "Required",
- "key": "string"
}, - "tty": true,
- "stdin": true,
- "environmentVariables": [
- {
- "name": "HOME",
- "value": "/home/my-folder",
- "secret": {
- "name": "postgress_secret",
- "key": "POSTGRES_PASSWORD"
}, - "exclude": false,
- "description": "Home directory of the user."
}
], - "annotations": [
- {
- "name": "billing",
- "value": "my-billing-unit",
- "exclude": false
}
], - "labels": [
- {
- "name": "stage",
- "value": "initial-research",
- "exclude": false
}
], - "tolerations": [
- {
- "name": "string",
- "operator": "Equal",
- "key": "string",
- "value": "string",
- "effect": "NoSchedule",
- "seconds": 1,
- "exclude": false
}
], - "terminateAfterPreemption": false,
- "autoDeletionTimeAfterCompletionSeconds": 15,
- "terminationGracePeriodSeconds": 20,
- "backoffLimit": 3,
- "ports": [
- {
- "container": 8080,
- "serviceType": "LoadBalancer",
- "external": 30080,
- "toolType": "pytorch",
- "toolName": "my-pytorch",
- "name": "port-instance-a"
}
], - "exposedUrls": [
- {
- "container": 8080,
- "authorizedUsers": [
- "user-a",
- "user-b"
], - "authorizedGroups": [
- "group-a",
- "group-b"
], - "toolType": "jupyter",
- "toolName": "my-pytorch",
- "name": "url-instance-a"
}
], - "compute": {
- "gpuDevicesRequest": 1,
- "gpuRequestType": "portion",
- "gpuPortionRequest": 0.5,
- "gpuPortionLimit": 0.5,
- "gpuMemoryRequest": "10M",
- "gpuMemoryLimit": "10M",
- "migProfile": "1g.5gb",
- "cpuCoreRequest": 0.5,
- "cpuCoreLimit": 2,
- "cpuMemoryRequest": "20M",
- "cpuMemoryLimit": "30M",
- "largeShmRequest": false,
- "extendedResources": [
- {
- "resource": "hardware-vendor.example/foo",
- "quantity": 2,
- "exclude": false
}
]
}, - "storage": {
- "dataVolume": [
- {
- "id": "123e4567-e89b-12d3-a456-426614174000",
- "mountPath": "/mnt/data"
}
], - "pvc": [
- {
- "name": "storage-instance-a",
- "path": "/container/my-claim",
- "existingPvc": false,
- "claimName": "my-claim",
- "readOnly": false,
- "ephemeral": false,
- "claimInfo": {
- "size": "1G",
- "storageClass": "my-storage-class",
- "accessModes": {
- "readWriteOnce": true,
- "readOnlyMany": false,
- "readWriteMany": false
}, - "volumeMode": "Filesystem"
}
}
], - "hostPath": [
- {
- "name": "storage-instance-a",
- "path": "/container/directory",
- "readOnly": true,
- "mountPath": "/local/directory",
- "mountPropagation": "None"
}
], - "nfs": [
- {
- "name": "storage-instance-a",
- "path": "/container/nfs",
- "readOnly": true,
- "server": "my.nfs.com",
- "mountPath": "/local/nfs"
}
], - "git": [
- {
- "name": "storage-instance-a",
- "branch": "main",
- "revision": "string",
- "path": "/container/my-repository",
- "passwordSecret": "my-password-secret",
- "secretKeyOfUser": "User",
- "secretKeyOfPassword": "Password"
}
], - "configMapVolume": [
- {
- "name": "storage-instance-a",
- "configMap": "string",
- "mountPath": "string"
}
], - "secretVolume": [
- {
- "name": "storage-instance-a",
- "mountPath": "string",
- "secret": "string"
}
], - "s3": [
- {
- "name": "storage-instance-a",
- "bucket": "my-bucket",
- "path": "/container/my-bucket",
- "accessKeySecret": "my-access-key-secret",
- "secretKeyOfAccessKeyId": "AccessKeyId",
- "secretKeyOfSecretKey": "SecretKey"
}
]
}, - "security": {
- "uidGidSource": "fromTheImage",
- "capabilities": [
- "CHOWN",
- "KILL"
], - "seccompProfileType": "RuntimeDefault",
- "runAsNonRoot": true,
- "readOnlyRootFilesystem": false,
- "runAsUid": 500,
- "runAsGid": 30,
- "supplementalGroups": "2,3,5,8",
- "allowPrivilegeEscalation": false,
- "hostIpc": false,
- "hostNetwork": false
}
}
}
{- "name": "my-workload-name",
- "requestedName": "string",
- "workloadId": "06d16c5d-4728-42fa-b573-3b11820d999f",
- "projectId": 1,
- "departmentId": 2,
- "clusterId": "71f69d83-ba66-4822-adf5-55ce55efd210",
- "createdBy": "test@lab.com",
- "createdAt": "2022-01-01T03:49:52.531Z",
- "desiredPhase": "Running",
- "actualPhase": "Creating",
- "spec": {
- "command": "python",
- "args": "-x my-script.py",
- "image": "python:3.8",
- "imagePullPolicy": "Always",
- "workingDir": "/home/myfolder",
- "createHomeDir": true,
- "probes": {
- "readiness": {
- "initialDelaySeconds": 0,
- "periodSeconds": 1,
- "timeoutSeconds": 1,
- "successThreshold": 1,
- "failureThreshold": 1,
- "handler": {
- "httpGet": {
- "path": "/",
- "port": 1,
- "host": "example.com",
- "scheme": "HTTP"
}
}
}
}, - "nodeType": "my-node-type",
- "nodePools": [
- "my-node-pool-a",
- "my-node-pool-b"
], - "podAffinity": {
- "type": "Required",
- "key": "string"
}, - "tty": true,
- "stdin": true,
- "environmentVariables": [
- {
- "name": "HOME",
- "value": "/home/my-folder",
- "secret": {
- "name": "postgress_secret",
- "key": "POSTGRES_PASSWORD"
}, - "exclude": false,
- "description": "Home directory of the user."
}
], - "annotations": [
- {
- "name": "billing",
- "value": "my-billing-unit",
- "exclude": false
}
], - "labels": [
- {
- "name": "stage",
- "value": "initial-research",
- "exclude": false
}
], - "tolerations": [
- {
- "name": "string",
- "operator": "Equal",
- "key": "string",
- "value": "string",
- "effect": "NoSchedule",
- "seconds": 1,
- "exclude": false
}
], - "terminateAfterPreemption": false,
- "autoDeletionTimeAfterCompletionSeconds": 15,
- "terminationGracePeriodSeconds": 20,
- "backoffLimit": 3,
- "ports": [
- {
- "container": 8080,
- "serviceType": "LoadBalancer",
- "external": 30080,
- "toolType": "pytorch",
- "toolName": "my-pytorch",
- "name": "port-instance-a"
}
], - "exposedUrls": [
- {
- "container": 8080,
- "authorizedUsers": [
- "user-a",
- "user-b"
], - "authorizedGroups": [
- "group-a",
- "group-b"
], - "toolType": "jupyter",
- "toolName": "my-pytorch",
- "name": "url-instance-a"
}
], - "numWorkers": 1,
- "distributedFramework": "MPI",
- "slotsPerWorker": 1,
- "minReplicas": 0,
- "maxReplicas": 0,
- "compute": {
- "gpuDevicesRequest": 1,
- "gpuRequestType": "portion",
- "gpuPortionRequest": 0.5,
- "gpuPortionLimit": 0.5,
- "gpuMemoryRequest": "10M",
- "gpuMemoryLimit": "10M",
- "migProfile": "1g.5gb",
- "cpuCoreRequest": 0.5,
- "cpuCoreLimit": 2,
- "cpuMemoryRequest": "20M",
- "cpuMemoryLimit": "30M",
- "largeShmRequest": false,
- "extendedResources": [
- {
- "resource": "hardware-vendor.example/foo",
- "quantity": 2,
- "exclude": false
}
]
}, - "storage": {
- "dataVolume": [
- {
- "id": "123e4567-e89b-12d3-a456-426614174000",
- "mountPath": "/mnt/data"
}
], - "pvc": [
- {
- "name": "storage-instance-a",
- "path": "/container/my-claim",
- "existingPvc": false,
- "claimName": "my-claim",
- "readOnly": false,
- "ephemeral": false,
- "claimInfo": {
- "size": "1G",
- "storageClass": "my-storage-class",
- "accessModes": {
- "readWriteOnce": true,
- "readOnlyMany": false,
- "readWriteMany": false
}, - "volumeMode": "Filesystem"
}
}
], - "hostPath": [
- {
- "name": "storage-instance-a",
- "path": "/container/directory",
- "readOnly": true,
- "mountPath": "/local/directory",
- "mountPropagation": "None"
}
], - "nfs": [
- {
- "name": "storage-instance-a",
- "path": "/container/nfs",
- "readOnly": true,
- "server": "my.nfs.com",
- "mountPath": "/local/nfs"
}
], - "git": [
- {
- "name": "storage-instance-a",
- "branch": "main",
- "revision": "string",
- "path": "/container/my-repository",
- "passwordSecret": "my-password-secret",
- "secretKeyOfUser": "User",
- "secretKeyOfPassword": "Password"
}
], - "configMapVolume": [
- {
- "name": "storage-instance-a",
- "configMap": "string",
- "mountPath": "string"
}
], - "secretVolume": [
- {
- "name": "storage-instance-a",
- "mountPath": "string",
- "secret": "string"
}
], - "s3": [
- {
- "name": "storage-instance-a",
- "bucket": "my-bucket",
- "path": "/container/my-bucket",
- "accessKeySecret": "my-access-key-secret",
- "secretKeyOfAccessKeyId": "AccessKeyId",
- "secretKeyOfSecretKey": "SecretKey"
}
]
}, - "security": {
- "uidGidSource": "fromTheImage",
- "capabilities": [
- "CHOWN",
- "KILL"
], - "seccompProfileType": "RuntimeDefault",
- "runAsNonRoot": true,
- "readOnlyRootFilesystem": false,
- "runAsUid": 500,
- "runAsGid": 30,
- "supplementalGroups": "2,3,5,8",
- "allowPrivilegeEscalation": false,
- "hostIpc": false,
- "hostNetwork": false
}
}, - "masterSpecSameAsWorker": true,
- "masterSpec": {
- "command": "python",
- "args": "-x my-script.py",
- "image": "python:3.8",
- "imagePullPolicy": "Always",
- "workingDir": "/home/myfolder",
- "createHomeDir": true,
- "probes": {
- "readiness": {
- "initialDelaySeconds": 0,
- "periodSeconds": 1,
- "timeoutSeconds": 1,
- "successThreshold": 1,
- "failureThreshold": 1,
- "handler": {
- "httpGet": {
- "path": "/",
- "port": 1,
- "host": "example.com",
- "scheme": "HTTP"
}
}
}
}, - "nodeType": "my-node-type",
- "nodePools": [
- "my-node-pool-a",
- "my-node-pool-b"
], - "podAffinity": {
- "type": "Required",
- "key": "string"
}, - "tty": true,
- "stdin": true,
- "environmentVariables": [
- {
- "name": "HOME",
- "value": "/home/my-folder",
- "secret": {
- "name": "postgress_secret",
- "key": "POSTGRES_PASSWORD"
}, - "exclude": false,
- "description": "Home directory of the user."
}
], - "annotations": [
- {
- "name": "billing",
- "value": "my-billing-unit",
- "exclude": false
}
], - "labels": [
- {
- "name": "stage",
- "value": "initial-research",
- "exclude": false
}
], - "tolerations": [
- {
- "name": "string",
- "operator": "Equal",
- "key": "string",
- "value": "string",
- "effect": "NoSchedule",
- "seconds": 1,
- "exclude": false
}
], - "terminateAfterPreemption": false,
- "autoDeletionTimeAfterCompletionSeconds": 15,
- "terminationGracePeriodSeconds": 20,
- "backoffLimit": 3,
- "ports": [
- {
- "container": 8080,
- "serviceType": "LoadBalancer",
- "external": 30080,
- "toolType": "pytorch",
- "toolName": "my-pytorch",
- "name": "port-instance-a"
}
], - "exposedUrls": [
- {
- "container": 8080,
- "authorizedUsers": [
- "user-a",
- "user-b"
], - "authorizedGroups": [
- "group-a",
- "group-b"
], - "toolType": "jupyter",
- "toolName": "my-pytorch",
- "name": "url-instance-a"
}
], - "compute": {
- "gpuDevicesRequest": 1,
- "gpuRequestType": "portion",
- "gpuPortionRequest": 0.5,
- "gpuPortionLimit": 0.5,
- "gpuMemoryRequest": "10M",
- "gpuMemoryLimit": "10M",
- "migProfile": "1g.5gb",
- "cpuCoreRequest": 0.5,
- "cpuCoreLimit": 2,
- "cpuMemoryRequest": "20M",
- "cpuMemoryLimit": "30M",
- "largeShmRequest": false,
- "extendedResources": [
- {
- "resource": "hardware-vendor.example/foo",
- "quantity": 2,
- "exclude": false
}
]
}, - "storage": {
- "dataVolume": [
- {
- "id": "123e4567-e89b-12d3-a456-426614174000",
- "mountPath": "/mnt/data"
}
], - "pvc": [
- {
- "name": "storage-instance-a",
- "path": "/container/my-claim",
- "existingPvc": false,
- "claimName": "my-claim",
- "readOnly": false,
- "ephemeral": false,
- "claimInfo": {
- "size": "1G",
- "storageClass": "my-storage-class",
- "accessModes": {
- "readWriteOnce": true,
- "readOnlyMany": false,
- "readWriteMany": false
}, - "volumeMode": "Filesystem"
}
}
], - "hostPath": [
- {
- "name": "storage-instance-a",
- "path": "/container/directory",
- "readOnly": true,
- "mountPath": "/local/directory",
- "mountPropagation": "None"
}
], - "nfs": [
- {
- "name": "storage-instance-a",
- "path": "/container/nfs",
- "readOnly": true,
- "server": "my.nfs.com",
- "mountPath": "/local/nfs"
}
], - "git": [
- {
- "name": "storage-instance-a",
- "branch": "main",
- "revision": "string",
- "path": "/container/my-repository",
- "passwordSecret": "my-password-secret",
- "secretKeyOfUser": "User",
- "secretKeyOfPassword": "Password"
}
], - "configMapVolume": [
- {
- "name": "storage-instance-a",
- "configMap": "string",
- "mountPath": "string"
}
], - "secretVolume": [
- {
- "name": "storage-instance-a",
- "mountPath": "string",
- "secret": "string"
}
], - "s3": [
- {
- "name": "storage-instance-a",
- "bucket": "my-bucket",
- "path": "/container/my-bucket",
- "accessKeySecret": "my-access-key-secret",
- "secretKeyOfAccessKeyId": "AccessKeyId",
- "secretKeyOfSecretKey": "SecretKey"
}
]
}, - "security": {
- "uidGidSource": "fromTheImage",
- "capabilities": [
- "CHOWN",
- "KILL"
], - "seccompProfileType": "RuntimeDefault",
- "runAsNonRoot": true,
- "readOnlyRootFilesystem": false,
- "runAsUid": 500,
- "runAsGid": 30,
- "supplementalGroups": "2,3,5,8",
- "allowPrivilegeEscalation": false,
- "hostIpc": false,
- "hostNetwork": false
}
}
}
Use this endpoint to delete a distributed training workload by workload ID.
No Content.
Unauthorized
Forbidden
The specified resource was not found
unexpected error
unexpected error
{
  "code": 401,
  "message": "Issuer is not familiar."
}
Retrieve the details of a distributed training workload by workload ID.
Executed successfully.
Unauthorized
Forbidden
The specified resource was not found
unexpected error
unexpected error
{- "name": "my-workload-name",
- "requestedName": "string",
- "workloadId": "06d16c5d-4728-42fa-b573-3b11820d999f",
- "projectId": 1,
- "departmentId": 2,
- "clusterId": "71f69d83-ba66-4822-adf5-55ce55efd210",
- "createdBy": "test@lab.com",
- "createdAt": "2022-01-01T03:49:52.531Z",
- "desiredPhase": "Running",
- "actualPhase": "Creating",
- "spec": {
- "command": "python",
- "args": "-x my-script.py",
- "image": "python:3.8",
- "imagePullPolicy": "Always",
- "workingDir": "/home/myfolder",
- "createHomeDir": true,
- "probes": {
- "readiness": {
- "initialDelaySeconds": 0,
- "periodSeconds": 1,
- "timeoutSeconds": 1,
- "successThreshold": 1,
- "failureThreshold": 1,
- "handler": {
- "httpGet": {
- "path": "/",
- "port": 1,
- "host": "example.com",
- "scheme": "HTTP"
}
}
}
}, - "nodeType": "my-node-type",
- "nodePools": [
- "my-node-pool-a",
- "my-node-pool-b"
], - "podAffinity": {
- "type": "Required",
- "key": "string"
}, - "tty": true,
- "stdin": true,
- "environmentVariables": [
- {
- "name": "HOME",
- "value": "/home/my-folder",
- "secret": {
- "name": "postgress_secret",
- "key": "POSTGRES_PASSWORD"
}, - "exclude": false,
- "description": "Home directory of the user."
}
], - "annotations": [
- {
- "name": "billing",
- "value": "my-billing-unit",
- "exclude": false
}
], - "labels": [
- {
- "name": "stage",
- "value": "initial-research",
- "exclude": false
}
], - "tolerations": [
- {
- "name": "string",
- "operator": "Equal",
- "key": "string",
- "value": "string",
- "effect": "NoSchedule",
- "seconds": 1,
- "exclude": false
}
], - "terminateAfterPreemption": false,
- "autoDeletionTimeAfterCompletionSeconds": 15,
- "terminationGracePeriodSeconds": 20,
- "backoffLimit": 3,
- "ports": [
- {
- "container": 8080,
- "serviceType": "LoadBalancer",
- "external": 30080,
- "toolType": "pytorch",
- "toolName": "my-pytorch",
- "name": "port-instance-a"
}
], - "exposedUrls": [
- {
- "container": 8080,
- "authorizedUsers": [
- "user-a",
- "user-b"
], - "authorizedGroups": [
- "group-a",
- "group-b"
], - "toolType": "jupyter",
- "toolName": "my-pytorch",
- "name": "url-instance-a"
}
], - "numWorkers": 1,
- "distributedFramework": "MPI",
- "slotsPerWorker": 1,
- "minReplicas": 0,
- "maxReplicas": 0,
- "compute": {
- "gpuDevicesRequest": 1,
- "gpuRequestType": "portion",
- "gpuPortionRequest": 0.5,
- "gpuPortionLimit": 0.5,
- "gpuMemoryRequest": "10M",
- "gpuMemoryLimit": "10M",
- "migProfile": "1g.5gb",
- "cpuCoreRequest": 0.5,
- "cpuCoreLimit": 2,
- "cpuMemoryRequest": "20M",
- "cpuMemoryLimit": "30M",
- "largeShmRequest": false,
- "extendedResources": [
- {
- "resource": "hardware-vendor.example/foo",
- "quantity": 2,
- "exclude": false
}
]
}, - "storage": {
- "dataVolume": [
- {
- "id": "123e4567-e89b-12d3-a456-426614174000",
- "mountPath": "/mnt/data"
}
], - "pvc": [
- {
- "name": "storage-instance-a",
- "path": "/container/my-claim",
- "existingPvc": false,
- "claimName": "my-claim",
- "readOnly": false,
- "ephemeral": false,
- "claimInfo": {
- "size": "1G",
- "storageClass": "my-storage-class",
- "accessModes": {
- "readWriteOnce": true,
- "readOnlyMany": false,
- "readWriteMany": false
}, - "volumeMode": "Filesystem"
}
}
], - "hostPath": [
- {
- "name": "storage-instance-a",
- "path": "/container/directory",
- "readOnly": true,
- "mountPath": "/local/directory",
- "mountPropagation": "None"
}
], - "nfs": [
- {
- "name": "storage-instance-a",
- "path": "/container/nfs",
- "readOnly": true,
- "server": "my.nfs.com",
- "mountPath": "/local/nfs"
}
], - "git": [
- {
- "name": "storage-instance-a",
- "branch": "main",
- "revision": "string",
- "path": "/container/my-repository",
- "passwordSecret": "my-password-secret",
- "secretKeyOfUser": "User",
- "secretKeyOfPassword": "Password"
}
], - "configMapVolume": [
- {
- "name": "storage-instance-a",
- "configMap": "string",
- "mountPath": "string"
}
], - "secretVolume": [
- {
- "name": "storage-instance-a",
- "mountPath": "string",
- "secret": "string"
}
], - "s3": [
- {
- "name": "storage-instance-a",
- "bucket": "my-bucket",
- "path": "/container/my-bucket",
- "accessKeySecret": "my-access-key-secret",
- "secretKeyOfAccessKeyId": "AccessKeyId",
- "secretKeyOfSecretKey": "SecretKey"
}
]
}, - "security": {
- "uidGidSource": "fromTheImage",
- "capabilities": [
- "CHOWN",
- "KILL"
], - "seccompProfileType": "RuntimeDefault",
- "runAsNonRoot": true,
- "readOnlyRootFilesystem": false,
- "runAsUid": 500,
- "runAsGid": 30,
- "supplementalGroups": "2,3,5,8",
- "allowPrivilegeEscalation": false,
- "hostIpc": false,
- "hostNetwork": false
}
}, - "masterSpecSameAsWorker": true,
- "masterSpec": {
- "command": "python",
- "args": "-x my-script.py",
- "image": "python:3.8",
- "imagePullPolicy": "Always",
- "workingDir": "/home/myfolder",
- "createHomeDir": true,
- "probes": {
- "readiness": {
- "initialDelaySeconds": 0,
- "periodSeconds": 1,
- "timeoutSeconds": 1,
- "successThreshold": 1,
- "failureThreshold": 1,
- "handler": {
- "httpGet": {
- "path": "/",
- "port": 1,
- "host": "example.com",
- "scheme": "HTTP"
}
}
}
}, - "nodeType": "my-node-type",
- "nodePools": [
- "my-node-pool-a",
- "my-node-pool-b"
], - "podAffinity": {
- "type": "Required",
- "key": "string"
}, - "tty": true,
- "stdin": true,
- "environmentVariables": [
- {
- "name": "HOME",
- "value": "/home/my-folder",
- "secret": {
- "name": "postgress_secret",
- "key": "POSTGRES_PASSWORD"
}, - "exclude": false,
- "description": "Home directory of the user."
}
], - "annotations": [
- {
- "name": "billing",
- "value": "my-billing-unit",
- "exclude": false
}
], - "labels": [
- {
- "name": "stage",
- "value": "initial-research",
- "exclude": false
}
], - "tolerations": [
- {
- "name": "string",
- "operator": "Equal",
- "key": "string",
- "value": "string",
- "effect": "NoSchedule",
- "seconds": 1,
- "exclude": false
}
], - "terminateAfterPreemption": false,
- "autoDeletionTimeAfterCompletionSeconds": 15,
- "terminationGracePeriodSeconds": 20,
- "backoffLimit": 3,
- "ports": [
- {
- "container": 8080,
- "serviceType": "LoadBalancer",
- "external": 30080,
- "toolType": "pytorch",
- "toolName": "my-pytorch",
- "name": "port-instance-a"
}
], - "exposedUrls": [
- {
- "container": 8080,
- "authorizedUsers": [
- "user-a",
- "user-b"
], - "authorizedGroups": [
- "group-a",
- "group-b"
], - "toolType": "jupyter",
- "toolName": "my-pytorch",
- "name": "url-instance-a"
}
], - "compute": {
- "gpuDevicesRequest": 1,
- "gpuRequestType": "portion",
- "gpuPortionRequest": 0.5,
- "gpuPortionLimit": 0.5,
- "gpuMemoryRequest": "10M",
- "gpuMemoryLimit": "10M",
- "migProfile": "1g.5gb",
- "cpuCoreRequest": 0.5,
- "cpuCoreLimit": 2,
- "cpuMemoryRequest": "20M",
- "cpuMemoryLimit": "30M",
- "largeShmRequest": false,
- "extendedResources": [
- {
- "resource": "hardware-vendor.example/foo",
- "quantity": 2,
- "exclude": false
}
]
}, - "storage": {
- "dataVolume": [
- {
- "id": "123e4567-e89b-12d3-a456-426614174000",
- "mountPath": "/mnt/data"
}
], - "pvc": [
- {
- "name": "storage-instance-a",
- "path": "/container/my-claim",
- "existingPvc": false,
- "claimName": "my-claim",
- "readOnly": false,
- "ephemeral": false,
- "claimInfo": {
- "size": "1G",
- "storageClass": "my-storage-class",
- "accessModes": {
- "readWriteOnce": true,
- "readOnlyMany": false,
- "readWriteMany": false
}, - "volumeMode": "Filesystem"
}
}
], - "hostPath": [
- {
- "name": "storage-instance-a",
- "path": "/container/directory",
- "readOnly": true,
- "mountPath": "/local/directory",
- "mountPropagation": "None"
}
], - "nfs": [
- {
- "name": "storage-instance-a",
- "path": "/container/nfs",
- "readOnly": true,
- "server": "my.nfs.com",
- "mountPath": "/local/nfs"
}
], - "git": [
- {
- "name": "storage-instance-a",
- "branch": "main",
- "revision": "string",
- "path": "/container/my-repository",
- "passwordSecret": "my-password-secret",
- "secretKeyOfUser": "User",
- "secretKeyOfPassword": "Password"
}
], - "configMapVolume": [
- {
- "name": "storage-instance-a",
- "configMap": "string",
- "mountPath": "string"
}
], - "secretVolume": [
- {
- "name": "storage-instance-a",
- "mountPath": "string",
- "secret": "string"
}
], - "s3": [
- {
- "name": "storage-instance-a",
- "bucket": "my-bucket",
- "path": "/container/my-bucket",
- "accessKeySecret": "my-access-key-secret",
- "secretKeyOfAccessKeyId": "AccessKeyId",
- "secretKeyOfSecretKey": "SecretKey"
}
]
}, - "security": {
- "uidGidSource": "fromTheImage",
- "capabilities": [
- "CHOWN",
- "KILL"
], - "seccompProfileType": "RuntimeDefault",
- "runAsNonRoot": true,
- "readOnlyRootFilesystem": false,
- "runAsUid": 500,
- "runAsGid": 30,
- "supplementalGroups": "2,3,5,8",
- "allowPrivilegeEscalation": false,
- "hostIpc": false,
- "hostNetwork": false
}
}
}
Suspend a running distributed training workload, identified by its workload ID.
Accepted.
Unauthorized
Forbidden
The specified resource was not found
unexpected error
unexpected error
{
  "code": 202,
  "message": "Request has been accepted."
}
Resume a suspended distributed training workload, identified by its workload ID.
Accepted.
Unauthorized
Forbidden
The specified resource was not found
unexpected error
unexpected error
{
  "code": 202,
  "message": "Request has been accepted."
}