# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
---
# Infrastructure for running leader/worker solver containers on ECS (EC2
# launch type): a VPC with one public and one private subnet, an ECS cluster
# backed by an Auto Scaling group, two task definitions and services
# (leader, worker), an EFS file system, two SQS queues, two DynamoDB tables,
# an ECR repository, and an S3 bucket.
AWSTemplateFormatVersion: '2010-09-09'
Description: 'AWS CloudFormation Sample Template Managed Single ECS Job Queue:
  This template demonstrates the usage of simple Job Queue and EC2 style
  Compute Environment along with multi-node jobs (on a relatively small scale:
  4 instances, 16 cores each).
  N.B.: This is all boilerplate until the EcsInstanceRole!
  '

Parameters:
  # Used in the ECR repository name, container names, log group names, and
  # the S3 bucket name.
  ProjectName:
    Type: String
    Default: "proofs"
    Description: "S3 bucket will be AccountId-Region-ProjectName"
  # Suffix appended to the region name to form the availability zone
  # (e.g. "a" -> "<region>a").
  AvailZoneId:
    Type: String
    Default: "a"
    Description: "Availability Zone ID"
  InstanceType:
    Type: String
    Default: "m6i.4xlarge"
    Description: "Instance type"
  # Soft memory limit (MiB) applied to both the leader and worker containers.
  ContainerMemory:
    Type: String
    Description: "Memory Size for containers (61000 for cloud, 253000 for parallel)"
  AmiImageId:
    Type: String
    Description: "Ami for your instances"

Resources:
  # ---------------------------------------------------------------------
  # Networking
  # ---------------------------------------------------------------------
  VPC:
    Type: AWS::EC2::VPC
    Properties:
      CidrBlock: 10.0.0.0/16
      EnableDnsHostnames: true
      EnableDnsSupport: true

  # Secondary CIDR range; the private subnet consumes it entirely.
  SecondCidr:
    Type: AWS::EC2::VPCCidrBlock
    Properties:
      CidrBlock: 10.1.0.0/16
      VpcId: !Ref VPC

  InternetGateway:
    Type: AWS::EC2::InternetGateway

  # Elastic IP used by the NAT gateway. A VPC Elastic IP defined in the same
  # template as the VPC must wait for the internet gateway attachment.
  EIP:
    Type: 'AWS::EC2::EIP'
    DependsOn: VPCGatewayAttachment
    Properties:
      Domain: vpc

  VPCGatewayAttachment:
    Type: AWS::EC2::VPCGatewayAttachment
    Properties:
      VpcId:
        Ref: VPC
      InternetGatewayId:
        Ref: InternetGateway

  # Shared by the EC2 instances, the ECS tasks, the SSM endpoint, and the EFS
  # mount target. Allows all TCP ports from 10.0.0.0/8, which covers both VPC
  # CIDR blocks (10.0.0.0/16 and 10.1.0.0/16). The previous value,
  # 10.0.0.0/0, has a zero-length mask and therefore matched every address.
  # NOTE(review): the rule description says "SSH port" but the rule covers
  # ports 0-65535 — confirm whether the range should be narrowed.
  SecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: EC2 Security Group for instances launched in the VPC by Batch
      VpcId:
        Ref: VPC
      SecurityGroupIngress:
        - CidrIp: 10.0.0.0/8
          Description: SSH port
          FromPort: 0
          IpProtocol: TCP
          ToPort: 65535

  # NAT gateway in the public subnet; gives the private subnet outbound access.
  NatGateway:
    Type: 'AWS::EC2::NatGateway'
    Properties:
      AllocationId: !GetAtt 'EIP.AllocationId'
      SubnetId: !Ref SubnetPublic

  # NOTE(review): this attaches the same internet gateway to the same VPC as
  # VPCGatewayAttachment above; it looks redundant — confirm before removing.
  NatAttachment:
    Type: AWS::EC2::VPCGatewayAttachment
    Properties:
      InternetGatewayId: !Ref InternetGateway
      VpcId: !Ref VPC

  # Private subnet (hosts the instances, tasks, EFS mount target, and SSM
  # endpoint). Must wait for the secondary CIDR it is carved from.
  SubnetPrivate:
    Type: AWS::EC2::Subnet
    DependsOn: SecondCidr
    Properties:
      AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
      CidrBlock: 10.1.0.0/16
      VpcId: !Ref VPC

  RouteTablePrivate:
    Type: AWS::EC2::RouteTable
    Properties:
      VpcId:
        Ref: VPC

  # Default route of the private subnet goes through the NAT gateway.
  RoutePrivate:
    Type: AWS::EC2::Route
    Properties:
      RouteTableId:
        Ref: RouteTablePrivate
      DestinationCidrBlock: 0.0.0.0/0
      NatGatewayId: !Ref NatGateway

  SubnetRouteTableAssociationPrivate:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      RouteTableId: !Ref RouteTablePrivate
      SubnetId: !Ref SubnetPrivate

  # Public subnet (hosts the NAT gateway).
  SubnetPublic:
    Type: AWS::EC2::Subnet
    Properties:
      AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
      CidrBlock: 10.0.1.0/24
      VpcId: !Ref VPC
      MapPublicIpOnLaunch: true

  RouteTablePublic:
    Type: AWS::EC2::RouteTable
    Properties:
      VpcId:
        Ref: VPC

  # Default route of the public subnet goes through the internet gateway.
  # A route that targets an internet gateway must wait for the gateway to be
  # attached to the VPC, hence the explicit DependsOn.
  RoutePublic:
    Type: AWS::EC2::Route
    DependsOn: VPCGatewayAttachment
    Properties:
      RouteTableId:
        Ref: RouteTablePublic
      DestinationCidrBlock: 0.0.0.0/0
      GatewayId: !Ref InternetGateway

  SubnetRouteTableAssociationPublic:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      RouteTableId: !Ref RouteTablePublic
      SubnetId: !Ref SubnetPublic

  # ---------------------------------------------------------------------
  # ECS cluster and the EC2 capacity behind it
  # ---------------------------------------------------------------------
  EcsCluster:
    Type: AWS::ECS::Cluster
    Properties:
      ClusterName: SatCompCluster
      ClusterSettings:
        - Name: containerInsights
          Value: enabled

  Ec2AutoscaleInstanceProfile:
    Type: "AWS::IAM::InstanceProfile"
    Properties:
      Path: "/"
      Roles:
        - Ref: "InstanceRole"

  # Role assumed by the EC2 container instances.
  InstanceRole:
    Type: AWS::IAM::Role
    Properties:
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service: [ ec2.amazonaws.com ]
            Action: [ 'sts:AssumeRole' ]
      Path: /
      Policies:
        - PolicyName: ecs-service
          PolicyDocument:
            Statement:
              - Effect: Allow
                Action:
                  # Rules which allow ECS to attach network interfaces to instances
                  # on your behalf in order for awsvpc networking mode to work right
                  - 'ec2:AttachNetworkInterface'
                  - 'ec2:CreateNetworkInterface'
                  - 'ec2:CreateNetworkInterfacePermission'
                  - 'ec2:DeleteNetworkInterface'
                  - 'ec2:DeleteNetworkInterfacePermission'
                  - 'ec2:Describe*'
                  - 'ec2:DetachNetworkInterface'
                  - 'elasticfilesystem:*'
                  - 'cloudwatch:*'
                  - 'ecs:*'
                  # Rules which allow ECS to update load balancers on your behalf
                  # with the information about how to send traffic to your containers
                  - 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
                  - 'elasticloadbalancing:DeregisterTargets'
                  - 'elasticloadbalancing:Describe*'
                  - 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
                  - 'elasticloadbalancing:RegisterTargets'
                  - 's3:GetObject'
                  - 's3:GetObjectVersion'
                Resource: '*'

  # Launch configuration for the container instances. The user data joins
  # the instance to the ECS cluster, installs tooling, and signals the Auto
  # Scaling group.
  # NOTE(review): AssociatePublicIpAddress is true although the group
  # launches into the private subnet — confirm this is intended.
  EcsInstanceLc:
    Type: AWS::AutoScaling::LaunchConfiguration
    Properties:
      ImageId: !Ref AmiImageId
      InstanceType: !Ref InstanceType
      AssociatePublicIpAddress: true
      IamInstanceProfile: !GetAtt Ec2AutoscaleInstanceProfile.Arn
      SecurityGroups: [ !Ref SecurityGroup ]
      BlockDeviceMappings:
        - DeviceName: /dev/xvda
          Ebs:
            VolumeSize: 30
            VolumeType: gp2
        - DeviceName: /dev/xvdcz
          Ebs:
            VolumeSize: 22
            VolumeType: gp2
      # The --resource argument must be the logical ID of the Auto Scaling
      # group in this template (EcsInstanceAsg); otherwise the rolling update,
      # which waits on resource signals, never receives one.
      UserData:
        Fn::Base64: !Sub |
          #!/bin/bash -xe
          echo ECS_CLUSTER=${EcsCluster} >> /etc/ecs/ecs.config
          yum install -y aws-cfn-bootstrap python-pip
          pip install awscli boto3
          /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource EcsInstanceAsg --region ${AWS::Region}

  # Auto Scaling group of container instances. Created empty (0/0/0); the
  # template itself does not scale it up.
  EcsInstanceAsg:
    Type: AWS::AutoScaling::AutoScalingGroup
    UpdatePolicy:
      AutoScalingRollingUpdate:
        MaxBatchSize: 10
        MinSuccessfulInstancesPercent: 95
        PauseTime: PT30M
        SuspendProcesses:
          - HealthCheck
          - ReplaceUnhealthy
          - AZRebalance
          - AlarmNotification
          - ScheduledActions
        WaitOnResourceSignals: true
    Properties:
      VPCZoneIdentifier: [ !Ref SubnetPrivate ]
      LaunchConfigurationName: !Ref EcsInstanceLc
      MinSize: '0'
      MaxSize: '0'
      DesiredCapacity: '0'
      Tags:
        - Key: IsAutoscaledCluster
          PropagateAtLaunch: true
          Value: "true"
        - Key: "Patch Group"
          PropagateAtLaunch: true
          Value: "ManagedClusterPatchGroup"

  # Capacity provider wrapping the Auto Scaling group; managed scaling and
  # managed termination protection are both turned off.
  AutoscaleCapacityProvider:
    Type: AWS::ECS::CapacityProvider
    Properties:
      AutoScalingGroupProvider:
        AutoScalingGroupArn: !Ref EcsInstanceAsg
        ManagedTerminationProtection: DISABLED
        ManagedScaling:
          Status: DISABLED
      Name: AutoscaleCapacityProvider

  # A security group for the containers we will run in Fargate.
  # Rules are added to this security group based on what ingress you
  # add for the cluster.
  # NOTE(review): nothing in this template references this group, and the
  # services below use the EC2 launch type — confirm it is still needed.
  ContainerSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Access to the Fargate containers
      VpcId: !Ref 'VPC'

  # A role used to allow AWS Autoscaling to inspect stats and adjust scaleable targets
  # on your AWS account
  AutoscalingRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service: [ application-autoscaling.amazonaws.com ]
            Action: [ 'sts:AssumeRole' ]
      Path: /
      Policies:
        - PolicyName: service-autoscaling
          PolicyDocument:
            Statement:
              - Effect: Allow
                Action:
                  - 'application-autoscaling:*'
                  - 'cloudwatch:DescribeAlarms'
                  - 'cloudwatch:PutMetricAlarm'
                  - 'ecs:DescribeServices'
                  - 'ecs:UpdateService'
                Resource: '*'

  # This is an IAM role which authorizes ECS to manage resources on your
  # account on your behalf, such as updating your load balancer with the
  # details of where your containers are, so that traffic can reach your
  # containers.
  ECSRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service: [ ecs.amazonaws.com ]
            Action: [ 'sts:AssumeRole' ]
      Path: /
      Policies:
        - PolicyName: ecs-service
          PolicyDocument:
            Statement:
              - Effect: Allow
                Action:
                  # Rules which allow ECS to attach network interfaces to instances
                  # on your behalf in order for awsvpc networking mode to work right
                  - 'ec2:AttachNetworkInterface'
                  - 'ec2:CreateNetworkInterface'
                  - 'ec2:CreateNetworkInterfacePermission'
                  - 'ec2:DeleteNetworkInterface'
                  - 'ec2:DeleteNetworkInterfacePermission'
                  - 'ec2:Describe*'
                  - 'ec2:DetachNetworkInterface'
                  - 'elasticfilesystem:*'
                  # Rules which allow ECS to update load balancers on your behalf
                  # with the information about how to send traffic to your containers
                  - 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
                  - 'elasticloadbalancing:DeregisterTargets'
                  - 'elasticloadbalancing:Describe*'
                  - 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
                  - 'elasticloadbalancing:RegisterTargets'
                Resource: '*'

  # Interface endpoint so instances in the private subnet can reach SSM.
  SsmVpcEndpoint:
    Type: AWS::EC2::VPCEndpoint
    Properties:
      VpcEndpointType: Interface
      ServiceName: !Sub 'com.amazonaws.${AWS::Region}.ssm'
      VpcId: !Ref VPC
      SecurityGroupIds: [ !Ref SecurityGroup ]
      SubnetIds: [ !Ref SubnetPrivate ]
      PrivateDnsEnabled: true

  # ---------------------------------------------------------------------
  # Task roles, task definitions, and services
  # ---------------------------------------------------------------------
  # Role assumed by the running leader/worker containers.
  # NOTE(review): AmazonEC2FullAccess and AmazonS3FullAccess are very broad —
  # consider scoping them down.
  EcsTaskRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonEC2FullAccess
        - arn:aws:iam::aws:policy/AmazonS3FullAccess
      Policies:
        # Policy versions are quoted so YAML parsers do not read them as dates.
        - PolicyName: !Sub "project-metrics-${AWS::Region}-${ProjectName}"
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Action:
                  - cloudwatch:PutMetricData
                Effect: Allow
                Resource: "*"
        - PolicyName: SatCompQueueExecutionRolePolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Action:
                  - "sqs:GetQueueAttributes"
                  - "sqs:SendMessage"
                  - "sqs:ReceiveMessage"
                  - "sqs:DeleteMessage"
                  - "sqs:DeleteMessageBatch"
                  - "sqs:GetQueueUrl"
                Effect: Allow
                Resource: "*"
        - PolicyName: SatCompDynamoExecutionRolePolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Action:
                  - "dynamodb:UpdateItem"
                  - "dynamodb:Scan"
                Effect: Allow
                Resource: "*"

  # Leader task: runs the "<ProjectName>:leader" image with the EFS access
  # point mounted at /mount/efs and port 22 mapped.
  SolverLeaderTaskDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
      TaskRoleArn: !GetAtt EcsTaskRole.Arn
      NetworkMode: awsvpc
      RequiresCompatibilities:
        - EC2
      Volumes:
        - Name: SatVolume
          EFSVolumeConfiguration:
            FilesystemId: !Ref FileSystem
            TransitEncryption: ENABLED
            AuthorizationConfig:
              AccessPointId: !Ref AccessPoint
      ContainerDefinitions:
        - Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:leader"
          Name: !Sub "${ProjectName}-leader"
          # Taken from the ContainerMemory parameter rather than hard-coded,
          # so the value supplied at stack creation is actually applied.
          MemoryReservation: !Ref ContainerMemory
          Environment:
            - Name: SQS_QUEUE_NAME
              Value: !GetAtt SatCompQueue.QueueName
            - Name: SQS_OUTPUT_QUEUE_NAME
              Value: !GetAtt SatCompOutputQueue.QueueName
            - Name: AWS_DEFAULT_REGION
              Value: !Ref AWS::Region
            - Name: SATCOMP_BUCKET_NAME
              Value: !Ref SatCompBucket
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: !Sub "/ecs/${ProjectName}-leader"
              awslogs-region: !Ref AWS::Region
              awslogs-stream-prefix: ecs
          MountPoints:
            - ContainerPath: '/mount/efs'
              SourceVolume: SatVolume
          PortMappings:
            - HostPort: 22
              ContainerPort: 22
              Protocol: TCP

  SolverLogGroupLeader:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub "/ecs/${ProjectName}-leader"
      RetentionInDays: 1827

  # Worker task: same shape as the leader task but runs the
  # "<ProjectName>:worker" image and logs to the worker log group.
  SolverWorkerTaskDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
      TaskRoleArn: !GetAtt EcsTaskRole.Arn
      NetworkMode: awsvpc
      RequiresCompatibilities:
        - EC2
      Volumes:
        - Name: SatVolume
          EFSVolumeConfiguration:
            FilesystemId: !Ref FileSystem
            TransitEncryption: ENABLED
            AuthorizationConfig:
              AccessPointId: !Ref AccessPoint
      ContainerDefinitions:
        - Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:worker"
          Name: !Sub "${ProjectName}-worker"
          # Taken from the ContainerMemory parameter rather than hard-coded,
          # so the value supplied at stack creation is actually applied.
          MemoryReservation: !Ref ContainerMemory
          Environment:
            - Name: SQS_QUEUE_NAME
              Value: !GetAtt SatCompQueue.QueueName
            - Name: SQS_OUTPUT_QUEUE_NAME
              Value: !GetAtt SatCompOutputQueue.QueueName
            - Name: AWS_DEFAULT_REGION
              Value: !Ref AWS::Region
            - Name: SATCOMP_BUCKET_NAME
              Value: !Ref SatCompBucket
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: !Sub "/ecs/${ProjectName}-worker"
              awslogs-region: !Ref AWS::Region
              awslogs-stream-prefix: ecs
          MountPoints:
            - ContainerPath: '/mount/efs'
              SourceVolume: SatVolume
          PortMappings:
            - HostPort: 22
              ContainerPort: 22
              Protocol: TCP

  SolverLogGroupWorker:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub "/ecs/${ProjectName}-worker"
      RetentionInDays: 1827

  # Role used by ECS itself to pull images and ship container logs.
  ECSTaskExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service: [ ecs-tasks.amazonaws.com ]
            Action: [ 'sts:AssumeRole' ]
      Path: /
      Policies:
        - PolicyName: AmazonECSTaskExecutionRolePolicy
          PolicyDocument:
            Statement:
              - Effect: Allow
                Action:
                  # Allow the ECS Tasks to download images from ECR
                  - 'ecr:GetAuthorizationToken'
                  - 'ecr:BatchCheckLayerAvailability'
                  - 'ecr:GetDownloadUrlForLayer'
                  - 'ecr:BatchGetImage'
                  # Allow the ECS tasks to upload logs to CloudWatch
                  - 'logs:CreateLogStream'
                  - 'logs:PutLogEvents'
                Resource: '*'

  # Services start with zero tasks; the template itself never raises the
  # desired count.
  SolverLeaderService:
    Type: AWS::ECS::Service
    Properties:
      Cluster:
        Ref: EcsCluster
      DesiredCount: 0
      LaunchType: EC2
      NetworkConfiguration:
        AwsvpcConfiguration:
          SecurityGroups:
            - !Ref SecurityGroup
          Subnets:
            - !Ref SubnetPrivate
      TaskDefinition:
        Ref: SolverLeaderTaskDefinition

  SolverWorkerService:
    Type: AWS::ECS::Service
    Properties:
      Cluster:
        Ref: EcsCluster
      DesiredCount: 0
      LaunchType: EC2
      NetworkConfiguration:
        AwsvpcConfiguration:
          SecurityGroups:
            - !Ref SecurityGroup
          Subnets:
            - !Ref SubnetPrivate
      TaskDefinition:
        Ref: SolverWorkerTaskDefinition

  # ---------------------------------------------------------------------
  # Shared storage (EFS)
  # ---------------------------------------------------------------------
  # Access point rooted at /sat-comp, owned by uid/gid 1001 with mode 0755.
  AccessPoint:
    Type: AWS::EFS::AccessPoint
    Properties:
      FileSystemId: !Ref FileSystem
      PosixUser:
        Uid: "1001"
        Gid: "1001"
      RootDirectory:
        CreationInfo:
          OwnerGid: "1001"
          OwnerUid: "1001"
          Permissions: "0755"
        Path: "/sat-comp"

  FileSystem:
    Type: AWS::EFS::FileSystem
    Properties:
      PerformanceMode: generalPurpose

  EFSMountTarget:
    Type: AWS::EFS::MountTarget
    Properties:
      FileSystemId: !Ref FileSystem
      SecurityGroups:
        - !Ref SecurityGroup
      SubnetId: !Ref SubnetPrivate

  # ---------------------------------------------------------------------
  # Queues and tables
  # ---------------------------------------------------------------------
  # Queue whose name is passed to the containers as SQS_QUEUE_NAME.
  SatCompQueue:
    Type: AWS::SQS::Queue
    Properties:
      QueueName: !Sub "${AWS::AccountId}-${AWS::Region}-SatCompQueue"
      VisibilityTimeout: 5
      ReceiveMessageWaitTimeSeconds: 20
      KmsDataKeyReusePeriodSeconds: 3600
      KmsMasterKeyId: "alias/aws/sqs"

  # Queue whose name is passed to the containers as SQS_OUTPUT_QUEUE_NAME.
  SatCompOutputQueue:
    Type: AWS::SQS::Queue
    Properties:
      QueueName: !Sub "${AWS::AccountId}-${AWS::Region}-SatCompOutputQueue"
      VisibilityTimeout: 5
      ReceiveMessageWaitTimeSeconds: 20
      KmsDataKeyReusePeriodSeconds: 3600
      KmsMasterKeyId: "alias/aws/sqs"

  # On-demand table keyed by nodeId.
  NodeManifest:
    Type: AWS::DynamoDB::Table
    Properties:
      AttributeDefinitions:
        - AttributeName: nodeId
          AttributeType: S
      BillingMode: PAY_PER_REQUEST
      KeySchema:
        - AttributeName: nodeId
          KeyType: HASH
      TableName: SatCompNodeManifest

  # On-demand table keyed by leaderIp.
  TaskEndNotification:
    Type: AWS::DynamoDB::Table
    Properties:
      AttributeDefinitions:
        - AttributeName: leaderIp
          AttributeType: S
      BillingMode: PAY_PER_REQUEST
      KeySchema:
        - AttributeName: leaderIp
          KeyType: HASH
      TableName: TaskEndNotification

  # ---------------------------------------------------------------------
  # Image repository and bucket
  # ---------------------------------------------------------------------
  # Single repository holding both the ":leader" and ":worker" image tags.
  # The lifecycle policy expires untagged images beyond the most recent one.
  SolverLeaderEcr:
    Type: AWS::ECR::Repository
    Properties:
      RepositoryName: !Sub "${ProjectName}"
      LifecyclePolicy:
        LifecyclePolicyText: '
          {
            "rules": [ {
              "rulePriority": 10,
              "description": "remove untagged images except the latest one",
              "selection": {
                "tagStatus": "untagged",
                "countType": "imageCountMoreThan",
                "countNumber": 1
              },
              "action": {
                "type": "expire"
              }
            } ]
          }'

  # SolverWorkerEcr:
  #   Type: AWS::ECR::Repository
  #   Properties:
  #     RepositoryName: !Sub "${ProjectName}-worker"
  #     LifecyclePolicy:
  #       LifecyclePolicyText: '
  #         {
  #           "rules": [ {
  #             "rulePriority": 10,
  #             "description": "remove untagged images except the latest one",
  #             "selection": {
  #               "tagStatus": "untagged",
  #               "countType": "imageCountMoreThan",
  #               "countNumber": 1
  #             },
  #             "action": {
  #               "type": "expire"
  #             }
  #           } ]
  #         }'

  # Encrypted bucket with all public access blocked; its name is passed to
  # the containers as SATCOMP_BUCKET_NAME.
  SatCompBucket:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Sub "${AWS::AccountId}-${AWS::Region}-${ProjectName}"
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256
      PublicAccessBlockConfiguration:
        BlockPublicAcls: true
        BlockPublicPolicy: true
        IgnorePublicAcls: true
        RestrictPublicBuckets: true

Outputs:
  # Exported so other stacks can place resources in the private subnet.
  Subnet:
    Value:
      Ref: SubnetPrivate
    Export:
      Name: SubnetId
  # Exported ID of the shared security group.
  SecurityGroupId:
    Value:
      Ref: SecurityGroup
    Export:
      Name: SecurityGroup