687 lines
20 KiB
YAML
Raw Normal View History

2023-03-28 08:48:09 +00:00
# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
---
# Template header: the CloudFormation format version followed by a free-text
# description of the stack. The description is a single quoted scalar that
# continues over several lines.
AWSTemplateFormatVersion: '2010-09-09'
Description: 'AWS CloudFormation Sample Template Managed Single ECS Job Queue: This
template demonstrates the usage of simple Job Queue and EC2 style Compute Environment
along with multi-node jobs (on a relatively small scale: 4 instances, 16 cores each).
N.B.: This is all boilerplate until the EcsInstanceRole! '
# Stack inputs. ContainerMemory and AmiImageId declare no default, so a value
# has to be supplied for each of them.
Parameters:
  # Used in the S3 bucket name, the ECR repository/image name, the container
  # names and the log group names.
  ProjectName:
    Description: 'S3 bucket will be AccountId-Region-ProjectName'
    Type: String
    Default: 'proofs'
  # Suffix appended to the region name to form the availability zone of both subnets.
  AvailZoneId:
    Description: 'Availability Zone ID'
    Type: String
    Default: 'a'
  # EC2 instance type used by the launch configuration.
  InstanceType:
    Description: 'Instance type'
    Type: String
    Default: 'm6i.4xlarge'
  ContainerMemory:
    Description: 'Memory Size for containers (61000 for cloud, 253000 for parallel)'
    Type: String
  # Image id used by the launch configuration.
  AmiImageId:
    Description: 'Ami for your instances'
    Type: String
Resources:
# Primary VPC (10.0.0.0/16) with DNS support and DNS hostnames switched on.
VPC:
  Type: 'AWS::EC2::VPC'
  Properties:
    EnableDnsSupport: true
    EnableDnsHostnames: true
    CidrBlock: '10.0.0.0/16'
# Associates an additional CIDR range (10.1.0.0/16) with the VPC; SubnetPrivate
# is carved out of this range.
SecondCidr:
  Type: 'AWS::EC2::VPCCidrBlock'
  Properties:
    VpcId:
      Ref: VPC
    CidrBlock: '10.1.0.0/16'
# Internet gateway for the VPC; it declares no properties of its own.
InternetGateway:
  Type: 'AWS::EC2::InternetGateway'
# Elastic IP whose AllocationId is consumed by NatGateway.
# An Elastic IP declared in the same template as its VPC has to wait for the
# internet gateway attachment, hence the explicit DependsOn.
EIP:
  Type: 'AWS::EC2::EIP'
  DependsOn: VPCGatewayAttachment
  Properties:
    Domain: vpc
# Attaches InternetGateway to the VPC.
VPCGatewayAttachment:
  Type: 'AWS::EC2::VPCGatewayAttachment'
  Properties:
    InternetGatewayId: !Ref InternetGateway
    VpcId: !Ref VPC
# Security group shared by the cluster instances (EcsInstanceLc), both ECS
# services, the EFS mount target and the SSM interface endpoint.
SecurityGroup:
  Type: AWS::EC2::SecurityGroup
  Properties:
    GroupDescription: EC2 Security Group for instances launched in the VPC by Batch
    VpcId:
      Ref: VPC
    # Ingress is limited to the two CIDR blocks of the VPC. A /0 prefix would
    # match every IPv4 address regardless of the leading octets, i.e. it would
    # open all TCP ports to any source.
    SecurityGroupIngress:
      - CidrIp: 10.0.0.0/16
        Description: All TCP ports from the primary VPC CIDR
        FromPort: 0
        IpProtocol: TCP
        ToPort: 65535
      - CidrIp: 10.1.0.0/16
        Description: All TCP ports from the secondary VPC CIDR
        FromPort: 0
        IpProtocol: TCP
        ToPort: 65535
# NAT gateway placed in the public subnet, using the address allocated by EIP.
# RoutePrivate sends the private subnet's default route through it.
NatGateway:
  Type: AWS::EC2::NatGateway
  Properties:
    SubnetId:
      Ref: SubnetPublic
    AllocationId:
      Fn::GetAtt: [EIP, AllocationId]
# Gateway attachment naming the same internet gateway and the same VPC as
# VPCGatewayAttachment.
# NOTE(review): this looks redundant with VPCGatewayAttachment - confirm
# whether it can be dropped.
NatAttachment:
  Type: 'AWS::EC2::VPCGatewayAttachment'
  Properties:
    VpcId:
      Ref: VPC
    InternetGatewayId:
      Ref: InternetGateway
# Private subnet covering the whole secondary CIDR block; it therefore has to
# wait until SecondCidr has been associated with the VPC.
SubnetPrivate:
  Type: 'AWS::EC2::Subnet'
  DependsOn:
    - SecondCidr
  Properties:
    VpcId:
      Ref: VPC
    CidrBlock: '10.1.0.0/16'
    AvailabilityZone:
      Fn::Sub: '${AWS::Region}${AvailZoneId}'
# Route table associated with the private subnet.
RouteTablePrivate:
  Type: 'AWS::EC2::RouteTable'
  Properties:
    VpcId: !Ref VPC
# Default route of the private route table: everything goes to the NAT gateway.
RoutePrivate:
  Type: 'AWS::EC2::Route'
  Properties:
    DestinationCidrBlock: '0.0.0.0/0'
    NatGatewayId:
      Ref: NatGateway
    RouteTableId: !Ref RouteTablePrivate
# Binds the private subnet to the private route table.
SubnetRouteTableAssociationPrivate:
  Type: 'AWS::EC2::SubnetRouteTableAssociation'
  Properties:
    SubnetId:
      Ref: SubnetPrivate
    RouteTableId:
      Ref: RouteTablePrivate
# Public subnet (10.0.1.0/24) in the same availability zone as the private
# subnet; it hosts the NAT gateway.
SubnetPublic:
  Type: AWS::EC2::Subnet
  Properties:
    AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
    CidrBlock: 10.0.1.0/24
    VpcId: !Ref VPC
    # Written as a real boolean rather than a quoted string, so the value does
    # not depend on string-to-boolean coercion.
    MapPublicIpOnLaunch: true
# Route table associated with the public subnet.
RouteTablePublic:
  Type: 'AWS::EC2::RouteTable'
  Properties:
    VpcId: !Ref VPC
# Default route of the public route table: everything goes to the internet
# gateway. Referencing the gateway only orders this route after the gateway
# itself; the route also needs the gateway to be attached to the VPC, hence
# the explicit dependency on the attachment.
RoutePublic:
  Type: AWS::EC2::Route
  DependsOn: VPCGatewayAttachment
  Properties:
    RouteTableId:
      Ref: RouteTablePublic
    DestinationCidrBlock: 0.0.0.0/0
    GatewayId: !Ref InternetGateway
# Binds the public subnet to the public route table.
SubnetRouteTableAssociationPublic:
  Type: 'AWS::EC2::SubnetRouteTableAssociation'
  Properties:
    SubnetId:
      Ref: SubnetPublic
    RouteTableId:
      Ref: RouteTablePublic
# ECS cluster named SatCompCluster with the containerInsights setting enabled.
EcsCluster:
  Type: 'AWS::ECS::Cluster'
  Properties:
    ClusterName: 'SatCompCluster'
    ClusterSettings:
      - Value: enabled
        Name: containerInsights
# Instance profile wrapping InstanceRole; the launch configuration refers to
# it by ARN.
Ec2AutoscaleInstanceProfile:
  Type: 'AWS::IAM::InstanceProfile'
  Properties:
    Roles: [ !Ref InstanceRole ]
    Path: '/'
# IAM role assumable by EC2 and handed to the cluster instances through
# Ec2AutoscaleInstanceProfile.
InstanceRole:
  Type: 'AWS::IAM::Role'
  Properties:
    Path: '/'
    AssumeRolePolicyDocument:
      Statement:
        - Effect: Allow
          Action:
            - 'sts:AssumeRole'
          Principal:
            Service:
              - ec2.amazonaws.com
    ManagedPolicyArns:
      - 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'
    Policies:
      - PolicyName: ecs-service
        PolicyDocument:
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                # Network-interface management, so that ECS can wire up
                # awsvpc networking on the instances.
                - 'ec2:AttachNetworkInterface'
                - 'ec2:CreateNetworkInterface'
                - 'ec2:CreateNetworkInterfacePermission'
                - 'ec2:DeleteNetworkInterface'
                - 'ec2:DeleteNetworkInterfacePermission'
                - 'ec2:Describe*'
                - 'ec2:DetachNetworkInterface'
                - 'elasticfilesystem:*'
                - 'cloudwatch:*'
                - 'ecs:*'
                # Load-balancer registration, so that ECS can publish
                # information about how to send traffic to the containers.
                - 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
                - 'elasticloadbalancing:DeregisterTargets'
                - 'elasticloadbalancing:Describe*'
                - 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
                - 'elasticloadbalancing:RegisterTargets'
                - 's3:GetObject'
                - 's3:GetObjectVersion'
# Launch configuration for the cluster's EC2 instances: image and instance
# type come from the AmiImageId / InstanceType parameters, with two gp2
# volumes (30 GiB on /dev/xvda, 22 GiB on /dev/xvdcz).
EcsInstanceLc:
  Type: AWS::AutoScaling::LaunchConfiguration
  Properties:
    ImageId: !Ref AmiImageId
    # Plain reference; selecting element 0 of a one-element list built from
    # the same parameter yields the identical value.
    InstanceType: !Ref InstanceType
    AssociatePublicIpAddress: true
    IamInstanceProfile: !GetAtt Ec2AutoscaleInstanceProfile.Arn
    SecurityGroups: [ !Ref SecurityGroup ]
    BlockDeviceMappings:
      - DeviceName: /dev/xvda
        Ebs:
          VolumeSize: 30
          VolumeType: gp2
      - DeviceName: /dev/xvdcz
        Ebs:
          VolumeSize: 22
          VolumeType: gp2
    # Boot script: registers the instance with the ECS cluster, installs the
    # CloudFormation helper scripts plus awscli/boto3, then signals the stack.
    # The signal must name the logical ID of the Auto Scaling group declared in
    # this template (EcsInstanceAsg); its rolling update policy waits on these
    # signals, and a signal sent to a non-existent logical ID never arrives.
    UserData:
      Fn::Base64: !Sub |
        #!/bin/bash -xe
        echo ECS_CLUSTER=${EcsCluster} >> /etc/ecs/ecs.config
        yum install -y aws-cfn-bootstrap python-pip
        pip install awscli boto3
        /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource EcsInstanceAsg --region ${AWS::Region}
# Auto Scaling group for the cluster instances. It is created empty
# (min / max / desired are all 0) and places instances in the private subnet.
EcsInstanceAsg:
  Type: 'AWS::AutoScaling::AutoScalingGroup'
  Properties:
    LaunchConfigurationName:
      Ref: EcsInstanceLc
    VPCZoneIdentifier:
      - Ref: SubnetPrivate
    MinSize: '0'
    MaxSize: '0'
    DesiredCapacity: '0'
    Tags:
      - Key: IsAutoscaledCluster
        Value: 'true'
        PropagateAtLaunch: true
      - Key: 'Patch Group'
        Value: 'ManagedClusterPatchGroup'
        PropagateAtLaunch: true
  # Rolling replacement in batches of up to 10 instances; each batch waits up
  # to 30 minutes for resource signals and needs 95% of them to succeed.
  UpdatePolicy:
    AutoScalingRollingUpdate:
      MaxBatchSize: 10
      MinSuccessfulInstancesPercent: 95
      PauseTime: PT30M
      WaitOnResourceSignals: 'true'
      SuspendProcesses:
        - HealthCheck
        - ReplaceUnhealthy
        - AZRebalance
        - AlarmNotification
        - ScheduledActions
# ECS capacity provider backed by EcsInstanceAsg, with both managed scaling
# and managed termination protection disabled.
AutoscaleCapacityProvider:
  Type: 'AWS::ECS::CapacityProvider'
  Properties:
    Name: AutoscaleCapacityProvider
    AutoScalingGroupProvider:
      AutoScalingGroupArn:
        Ref: EcsInstanceAsg
      ManagedScaling:
        Status: DISABLED
      ManagedTerminationProtection: DISABLED
# Security group for containers; it declares no ingress rules of its own.
# Its description mentions Fargate, whereas the task definitions in this
# template require the EC2 launch type.
# NOTE(review): no other resource visible here references this group -
# confirm whether it is still needed.
ContainerSecurityGroup:
  Type: 'AWS::EC2::SecurityGroup'
  Properties:
    VpcId:
      Ref: VPC
    GroupDescription: 'Access to the Fargate containers'
# Role assumable by Application Auto Scaling so that it can inspect metrics
# and adjust scalable targets in the account.
AutoscalingRole:
  Type: 'AWS::IAM::Role'
  Properties:
    Path: '/'
    AssumeRolePolicyDocument:
      Statement:
        - Effect: Allow
          Action:
            - 'sts:AssumeRole'
          Principal:
            Service:
              - application-autoscaling.amazonaws.com
    Policies:
      - PolicyName: service-autoscaling
        PolicyDocument:
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                - 'application-autoscaling:*'
                - 'cloudwatch:DescribeAlarms'
                - 'cloudwatch:PutMetricAlarm'
                - 'ecs:DescribeServices'
                - 'ecs:UpdateService'
# Role assumable by the ECS service itself. It lets ECS manage resources in
# the account on your behalf, for example keeping a load balancer informed of
# where the containers are so that traffic can reach them.
ECSRole:
  Type: 'AWS::IAM::Role'
  Properties:
    Path: '/'
    AssumeRolePolicyDocument:
      Statement:
        - Effect: Allow
          Action:
            - 'sts:AssumeRole'
          Principal:
            Service:
              - ecs.amazonaws.com
    Policies:
      - PolicyName: ecs-service
        PolicyDocument:
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                # Network-interface management, so that ECS can wire up
                # awsvpc networking on the instances.
                - 'ec2:AttachNetworkInterface'
                - 'ec2:CreateNetworkInterface'
                - 'ec2:CreateNetworkInterfacePermission'
                - 'ec2:DeleteNetworkInterface'
                - 'ec2:DeleteNetworkInterfacePermission'
                - 'ec2:Describe*'
                - 'ec2:DetachNetworkInterface'
                - 'elasticfilesystem:*'
                # Load-balancer registration, so that ECS can publish
                # information about how to send traffic to the containers.
                - 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
                - 'elasticloadbalancing:DeregisterTargets'
                - 'elasticloadbalancing:Describe*'
                - 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
                - 'elasticloadbalancing:RegisterTargets'
# Interface VPC endpoint for the regional SSM service, placed in the private
# subnet with private DNS enabled.
SsmVpcEndpoint:
  Type: 'AWS::EC2::VPCEndpoint'
  Properties:
    ServiceName:
      Fn::Sub: 'com.amazonaws.${AWS::Region}.ssm'
    VpcEndpointType: Interface
    PrivateDnsEnabled: true
    VpcId:
      Ref: VPC
    SubnetIds:
      - Ref: SubnetPrivate
    SecurityGroupIds:
      - Ref: SecurityGroup
# Role assumed by the solver tasks themselves (TaskRoleArn of both task
# definitions). Besides the two managed policies it grants CloudWatch metric
# publishing, SQS queue access and DynamoDB UpdateItem/Scan.
EcsTaskRole:
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: ecs-tasks.amazonaws.com
          Action: sts:AssumeRole
    ManagedPolicyArns:
      - arn:aws:iam::aws:policy/AmazonEC2FullAccess
      - arn:aws:iam::aws:policy/AmazonS3FullAccess
    # The policy language version is quoted in every document below: an
    # unquoted 2012-10-17 is a date, not a string, to YAML 1.1 parsers, and
    # quoting keeps it consistent with the trust policy above.
    Policies:
      - PolicyName: !Sub "project-metrics-${AWS::Region}-${ProjectName}"
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Action:
                - cloudwatch:PutMetricData
              Effect: Allow
              Resource: "*"
      - PolicyName: SatCompQueueExecutionRolePolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Action:
                - "sqs:GetQueueAttributes"
                - "sqs:SendMessage"
                - "sqs:ReceiveMessage"
                - "sqs:DeleteMessage"
                - "sqs:DeleteMessageBatch"
                - "sqs:GetQueueUrl"
              Effect: Allow
              Resource: "*"
      - PolicyName: SatCompDynamoExecutionRolePolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Action:
                - "dynamodb:UpdateItem"
                - "dynamodb:Scan"
              Effect: Allow
              Resource: "*"
# Task definition for the solver "leader" container: EC2 launch type, awsvpc
# networking, image tag "leader" from the project's ECR repository.
SolverLeaderTaskDefinition:
  Type: AWS::ECS::TaskDefinition
  Properties:
    ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
    TaskRoleArn: !GetAtt EcsTaskRole.Arn
    NetworkMode: awsvpc
    RequiresCompatibilities:
      - EC2
    # Shared EFS volume, reached through the access point with transit
    # encryption enabled.
    Volumes:
      - Name: SatVolume
        EFSVolumeConfiguration:
          FilesystemId: !Ref FileSystem
          TransitEncryption: ENABLED
          AuthorizationConfig:
            AccessPointId: !Ref AccessPoint
    ContainerDefinitions:
      - Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:leader"
        Name: !Sub "${ProjectName}-leader"
        # Memory reservation follows the ContainerMemory stack parameter
        # ("Memory Size for containers") instead of a fixed 61000, so the
        # value chosen at deployment actually takes effect.
        MemoryReservation: !Ref ContainerMemory
        # Queue names, region and bucket name handed to the container.
        Environment:
          - Name: SQS_QUEUE_NAME
            Value: !GetAtt SatCompQueue.QueueName
          - Name: SQS_OUTPUT_QUEUE_NAME
            Value: !GetAtt SatCompOutputQueue.QueueName
          - Name: AWS_DEFAULT_REGION
            Value: !Ref AWS::Region
          - Name: SATCOMP_BUCKET_NAME
            Value: !Ref SatCompBucket
        # Same log group name as the SolverLogGroupLeader resource.
        LogConfiguration:
          LogDriver: awslogs
          Options:
            awslogs-group: !Sub "/ecs/${ProjectName}-leader"
            awslogs-region: !Ref AWS::Region
            awslogs-stream-prefix: ecs
        MountPoints:
          - ContainerPath: '/mount/efs'
            SourceVolume: SatVolume
        PortMappings:
          - HostPort: 22
            ContainerPort: 22
            Protocol: TCP
# CloudWatch Logs group for the leader container; entries are retained for
# 1827 days.
SolverLogGroupLeader:
  Type: 'AWS::Logs::LogGroup'
  Properties:
    RetentionInDays: 1827
    LogGroupName:
      Fn::Sub: '/ecs/${ProjectName}-leader'
# Task definition for the solver "worker" container: EC2 launch type, awsvpc
# networking, image tag "worker" from the project's ECR repository.
SolverWorkerTaskDefinition:
  Type: AWS::ECS::TaskDefinition
  Properties:
    ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
    TaskRoleArn: !GetAtt EcsTaskRole.Arn
    NetworkMode: awsvpc
    RequiresCompatibilities:
      - EC2
    # Shared EFS volume, reached through the access point with transit
    # encryption enabled.
    Volumes:
      - Name: SatVolume
        EFSVolumeConfiguration:
          FilesystemId: !Ref FileSystem
          TransitEncryption: ENABLED
          AuthorizationConfig:
            AccessPointId: !Ref AccessPoint
    ContainerDefinitions:
      - Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:worker"
        Name: !Sub "${ProjectName}-worker"
        # Memory reservation follows the ContainerMemory stack parameter
        # ("Memory Size for containers") instead of a fixed 61000, so the
        # value chosen at deployment actually takes effect.
        MemoryReservation: !Ref ContainerMemory
        # Queue names, region and bucket name handed to the container.
        Environment:
          - Name: SQS_QUEUE_NAME
            Value: !GetAtt SatCompQueue.QueueName
          - Name: SQS_OUTPUT_QUEUE_NAME
            Value: !GetAtt SatCompOutputQueue.QueueName
          - Name: AWS_DEFAULT_REGION
            Value: !Ref AWS::Region
          - Name: SATCOMP_BUCKET_NAME
            Value: !Ref SatCompBucket
        # Same log group name as the SolverLogGroupWorker resource.
        LogConfiguration:
          LogDriver: awslogs
          Options:
            awslogs-group: !Sub "/ecs/${ProjectName}-worker"
            awslogs-region: !Ref AWS::Region
            awslogs-stream-prefix: ecs
        MountPoints:
          - ContainerPath: '/mount/efs'
            SourceVolume: SatVolume
        PortMappings:
          - HostPort: 22
            ContainerPort: 22
            Protocol: TCP
# CloudWatch Logs group for the worker container; entries are retained for
# 1827 days.
SolverLogGroupWorker:
  Type: 'AWS::Logs::LogGroup'
  Properties:
    RetentionInDays: 1827
    LogGroupName:
      Fn::Sub: '/ecs/${ProjectName}-worker'
# Execution role used by both task definitions: lets ECS tasks pull images
# from ECR and write their logs to CloudWatch.
ECSTaskExecutionRole:
  Type: 'AWS::IAM::Role'
  Properties:
    Path: '/'
    AssumeRolePolicyDocument:
      Statement:
        - Effect: Allow
          Action:
            - 'sts:AssumeRole'
          Principal:
            Service:
              - ecs-tasks.amazonaws.com
    Policies:
      - PolicyName: AmazonECSTaskExecutionRolePolicy
        PolicyDocument:
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                # Image download from ECR
                - 'ecr:GetAuthorizationToken'
                - 'ecr:BatchCheckLayerAvailability'
                - 'ecr:GetDownloadUrlForLayer'
                - 'ecr:BatchGetImage'
                # Log upload to CloudWatch
                - 'logs:CreateLogStream'
                - 'logs:PutLogEvents'
# ECS service for the leader task definition. It starts with zero tasks and
# runs them on EC2 in the private subnet.
SolverLeaderService:
  Type: 'AWS::ECS::Service'
  Properties:
    Cluster: !Ref EcsCluster
    TaskDefinition: !Ref SolverLeaderTaskDefinition
    LaunchType: EC2
    DesiredCount: 0
    NetworkConfiguration:
      AwsvpcConfiguration:
        Subnets:
          - Ref: SubnetPrivate
        SecurityGroups:
          - Ref: SecurityGroup
# ECS service for the worker task definition. It starts with zero tasks and
# runs them on EC2 in the private subnet.
SolverWorkerService:
  Type: 'AWS::ECS::Service'
  Properties:
    Cluster: !Ref EcsCluster
    TaskDefinition: !Ref SolverWorkerTaskDefinition
    LaunchType: EC2
    DesiredCount: 0
    NetworkConfiguration:
      AwsvpcConfiguration:
        Subnets:
          - Ref: SubnetPrivate
        SecurityGroups:
          - Ref: SecurityGroup
# EFS access point rooted at /sat-comp. The POSIX identity and the creation
# owner are both uid/gid 1001, with creation permissions 0755.
AccessPoint:
  Type: 'AWS::EFS::AccessPoint'
  Properties:
    FileSystemId:
      Ref: FileSystem
    RootDirectory:
      Path: '/sat-comp'
      CreationInfo:
        OwnerUid: '1001'
        OwnerGid: '1001'
        Permissions: '0755'
    PosixUser:
      Uid: '1001'
      Gid: '1001'
# EFS file system (general purpose performance mode) backing the SatVolume
# volume of both task definitions.
FileSystem:
  Type: 'AWS::EFS::FileSystem'
  Properties:
    PerformanceMode: 'generalPurpose'
# Mount target exposing the file system inside the private subnet.
EFSMountTarget:
  Type: 'AWS::EFS::MountTarget'
  Properties:
    SubnetId:
      Ref: SubnetPrivate
    FileSystemId:
      Ref: FileSystem
    SecurityGroups: [ !Ref SecurityGroup ]
# SQS queue whose name is passed to the containers as SQS_QUEUE_NAME.
# Encrypted with the alias/aws/sqs key; 5 s visibility timeout, 20 s receive
# wait time.
SatCompQueue:
  Type: 'AWS::SQS::Queue'
  Properties:
    QueueName:
      Fn::Sub: '${AWS::AccountId}-${AWS::Region}-SatCompQueue'
    KmsMasterKeyId: 'alias/aws/sqs'
    KmsDataKeyReusePeriodSeconds: 3600
    ReceiveMessageWaitTimeSeconds: 20
    VisibilityTimeout: 5
# SQS queue whose name is passed to the containers as SQS_OUTPUT_QUEUE_NAME.
# Encrypted with the alias/aws/sqs key; 5 s visibility timeout, 20 s receive
# wait time.
SatCompOutputQueue:
  Type: 'AWS::SQS::Queue'
  Properties:
    QueueName:
      Fn::Sub: '${AWS::AccountId}-${AWS::Region}-SatCompOutputQueue'
    KmsMasterKeyId: 'alias/aws/sqs'
    KmsDataKeyReusePeriodSeconds: 3600
    ReceiveMessageWaitTimeSeconds: 20
    VisibilityTimeout: 5
# On-demand DynamoDB table SatCompNodeManifest, keyed by the string
# attribute nodeId.
NodeManifest:
  Type: 'AWS::DynamoDB::Table'
  Properties:
    TableName: SatCompNodeManifest
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: nodeId
        KeyType: HASH
    AttributeDefinitions:
      - AttributeName: nodeId
        AttributeType: S
# On-demand DynamoDB table TaskEndNotification, keyed by the string
# attribute leaderIp.
TaskEndNotification:
  Type: 'AWS::DynamoDB::Table'
  Properties:
    TableName: TaskEndNotification
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: leaderIp
        KeyType: HASH
    AttributeDefinitions:
      - AttributeName: leaderIp
        AttributeType: S
# ECR repository named after the ProjectName parameter; both task definitions
# pull their images (tags "leader" and "worker") from a repository of that name.
# The lifecycle policy is embedded as a JSON string: its single rule expires
# untagged images once more than one of them exists.
SolverLeaderEcr:
Type: AWS::ECR::Repository
Properties:
RepositoryName: !Sub "${ProjectName}"
LifecyclePolicy:
LifecyclePolicyText: '
{
"rules": [ {
"rulePriority": 10,
"description": "remove untagged images except the latest one",
"selection": {
"tagStatus": "untagged",
"countType": "imageCountMoreThan",
"countNumber": 1
},
"action": {
"type": "expire"
}
} ]
}'
# SolverWorkerEcr:
# Type: AWS::ECR::Repository
# Properties:
# RepositoryName: !Sub "${ProjectName}-worker"
# LifecyclePolicy:
# LifecyclePolicyText: '
# {
# "rules": [ {
# "rulePriority": 10,
# "description": "remove untagged images except the latest one",
# "selection": {
# "tagStatus": "untagged",
# "countType": "imageCountMoreThan",
# "countNumber": 1
# },
# "action": {
# "type": "expire"
# }
# } ]
# }'
# S3 bucket named AccountId-Region-ProjectName, with AES256 default
# encryption and every public-access block switched on.
SatCompBucket:
  Type: 'AWS::S3::Bucket'
  Properties:
    BucketName:
      Fn::Sub: '${AWS::AccountId}-${AWS::Region}-${ProjectName}'
    PublicAccessBlockConfiguration:
      BlockPublicAcls: true
      IgnorePublicAcls: true
      BlockPublicPolicy: true
      RestrictPublicBuckets: true
    BucketEncryption:
      ServerSideEncryptionConfiguration:
        - ServerSideEncryptionByDefault:
            SSEAlgorithm: AES256
# Stack outputs, exported under the names SubnetId and SecurityGroup.
Outputs:
  # The private subnet.
  Subnet:
    Value: !Ref SubnetPrivate
    Export:
      Name: SubnetId
  # The shared security group.
  SecurityGroupId:
    Value: !Ref SecurityGroup
    Export:
      Name: SecurityGroup