Add AWS example
3
aws-batch-comp-infrastructure-sample/.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,3 @@
# Default ignored files
/workspace.xml
12
aws-batch-comp-infrastructure-sample/.idea/aws-batch-comp-infrastructure-sample.iml
generated
Normal file
@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="projectConfiguration" value="pytest" />
    <option name="PROJECT_TEST_RUNNER" value="pytest" />
  </component>
</module>
6
aws-batch-comp-infrastructure-sample/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
4
aws-batch-comp-infrastructure-sample/.idea/misc.xml
generated
Normal file
@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>
8
aws-batch-comp-infrastructure-sample/.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/aws-batch-comp-infrastructure-sample.iml" filepath="$PROJECT_DIR$/.idea/aws-batch-comp-infrastructure-sample.iml" />
    </modules>
  </component>
</project>
6
aws-batch-comp-infrastructure-sample/.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
4
aws-batch-comp-infrastructure-sample/CODE_OF_CONDUCT.md
Normal file
@ -0,0 +1,4 @@
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.
61
aws-batch-comp-infrastructure-sample/CONTRIBUTING.md
Normal file
@ -0,0 +1,61 @@
# Contributing Guidelines

Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional documentation, we greatly value feedback and contributions from our community.

Please read through this document before submitting any issues or pull requests to ensure we have all the necessary information to effectively respond to your bug report or contribution.


## Reporting Bugs/Feature Requests

We welcome you to use the GitHub issue tracker to report bugs or suggest features.

When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:

* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment


## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:

1. You are working against the latest source on the *master* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.

To send us a pull request, please:

1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.

GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.


## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.


## Security issue notifications
If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.


## Licensing

See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.

We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
15
aws-batch-comp-infrastructure-sample/LICENSE
Normal file
@ -0,0 +1,15 @@
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
75
aws-batch-comp-infrastructure-sample/README-changes.md
Normal file
@ -0,0 +1,75 @@
# Changes from 2022 SAT/SMT-Comp Interactions

The fundamental architecture for the SAT/SMT competition is unchanged from 2022. However, based on feedback from competitors in 2022, we have made some small changes to how the infrastructure interacts with the solvers provided by each team through the `input.json` and `solver_out.json` files.

## Changes to input.json

We have enriched the `input.json` file so that solvers can be invoked with different arguments / solver names, which lets you test different strategies without rebuilding the container, and so that a timeout can be specified for the solver.

The 2022 format for input.json was as follows:

```text
{
  "problem_path": "/mount/efs/problem.cnf",
  "worker_node_ips": ["192.158.1.38", "192.158.2.39", ...]
}
```

In 2023, we have expanded this format to:

```text
{
  "formula_file": "/mount/efs/problem.cnf",
  "formula_language": "DIMACS",
  "solver_argument_list": ["-p", "another_arg"],
  "timeout_seconds": 10,
  "worker_node_ips": ["192.158.1.38", "192.158.2.39", ...]
}
```

where:
* `formula_file` is the path to the SAT/SMT problem to be solved.
* `formula_language` is the encoding of the problem (currently we use DIMACS for SAT-Comp and SMTLIB2 for SMT-Comp). This field is optional and can be ignored by the solver.
* `solver_argument_list` allows passthrough of arguments to the solver. This allows you to try different strategies without rebuilding your docker container by varying the arguments.
* `timeout_seconds` is the timeout for the solver. It will be enforced by the infrastructure; a solver that doesn't complete within the timeout will be terminated.
* `worker_node_ips` is unchanged; for cloud solvers, it is the list of worker nodes. For parallel solvers this field will always be the empty list.

The executable / python file that starts the solver should be updated to support this new format; a sketch of what that might look like follows.
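As an illustration only (this is not part of the official infrastructure), here is a minimal sketch of how a solver entry script might consume the new fields. The solver binary `/competition/my_solver` is a placeholder you would replace with your own executable:

```python
#!/usr/bin/env python3
# Minimal sketch: read the 2023 input.json and launch a solver with the
# requested arguments and timeout. Paths and names are illustrative only.
import json
import subprocess
import sys

request_dir = sys.argv[1]                      # directory containing input.json
with open(f"{request_dir}/input.json") as f:
    cfg = json.load(f)

cmd = ["/competition/my_solver"]               # placeholder solver binary
cmd += cfg.get("solver_argument_list", [])     # passthrough arguments
cmd.append(cfg["formula_file"])                # problem to solve

timeout = cfg.get("timeout_seconds")           # also enforced by the infrastructure
try:
    proc = subprocess.run(cmd, timeout=timeout)
    print(f"solver exited with {proc.returncode}")
except subprocess.TimeoutExpired:
    print("solver did not finish within timeout_seconds")
```
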
## Changes to solver_out.json

The 2022 format for solver_out.json was as follows:

```text
{
  "return_code": Number, // of running the solve command
  "result": String, // Should be one of {"SAT", "UNSAT", "UNKNOWN", "ERROR"}
  "artifacts": {
    "stdout_path": String, // Where to find the stdout of the solve run
    "stderr_path": String // Where to find the stderr of the solve run
  }
}
```

In 2023, we have modified the format to:

```text
{
  "return_code": int, // The return code for the solver.
  "artifacts": {
    "stdout_path": String, // Where to find the stdout of the solve run
    "stderr_path": String // Where to find the stderr of the solve run
  }
}
```

where:
* `return_code` is the return code for the solver. **N.B.**: Unlike the 2022 format, the return_code in `solver_out.json` will determine the satisfiability of the problem for the solver: A return code of 10 indicates SAT, 20 indicates UNSAT, 0 indicates UNKNOWN, and all other return codes indicate an error.
* `stdout_path` is the path to the stdout log.
* `stderr_path` is the path to the stderr log.

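To make the new contract concrete, here is a minimal, illustrative sketch of the last step a wrapper script might perform; the verdict string and log paths are placeholders for whatever your own solver produces:

```python
#!/usr/bin/env python3
# Minimal sketch: map a solver verdict to the competition return code and
# write solver_out.json. All inputs below are placeholders.
import json

RETURN_CODES = {"SAT": 10, "UNSAT": 20, "UNKNOWN": 0}

def write_solver_out(request_dir: str, verdict: str, stdout_path: str, stderr_path: str) -> int:
    code = RETURN_CODES.get(verdict, -1)   # anything else signals an error
    payload = {
        "return_code": code,
        "artifacts": {"stdout_path": stdout_path, "stderr_path": stderr_path},
    }
    with open(f"{request_dir}/solver_out.json", "w") as f:
        json.dump(payload, f)
    return code

# Example usage (placeholder values):
# write_solver_out("/rundir", "SAT", "/rundir/stdout.log", "/rundir/stderr.log")
```
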
The Mallob example solver has been updated to support the new format, so you can see these changes by examining the leader solver python file [here](docker/mallob-images/leader/solver). This file is compatible with the new format, but does not use the `formula_language`, `timeout_seconds` or `solver_argument_list` fields.

Other than these changes, solvers should be able to run unchanged on the 2023 architecture. Much more information on solver development is available in the [Solver Development README](docker/README-Solver-Development.md).

If you have problems, please contact us at either sat-comp@amazon.com (for SAT-Comp) or aws-smtcomp-2023@googlegroups.com (for SMT-Comp).
82
aws-batch-comp-infrastructure-sample/README.md
Normal file
@ -0,0 +1,82 @@
# SAT-Comp and SMT-Comp Cloud Track Instructions

This repository will help you get your parallel or distributed solver running efficiently on AWS. You will build docker containers with your solver and then connect them to the AWS infrastructure.

We recommend that you work in four steps:

1. Create and configure an AWS Account for the competition. Please do this right away and send us an email, so we can start the process to give you AWS credits. You can then continue with step 2 while waiting for us to answer.
2. Build your solver as a Docker image and run experiments locally.
3. Set up AWS infrastructure and test your solver on the cloud.
4. When ready, share the solver repository and Docker image with us.

We describe step 1 in this document. Step 2 is described in the [Solver Development README](docker/README-Solver-Development.md), while steps 3 and 4 are described in the [Infrastructure README](infrastructure/README-Infrastructure.md).


## Creating an AWS Account

First, create a specific AWS account for the competition. If you have not created an AWS account previously, it is straightforward to do, requiring a cell phone number, credit card, and address. Make sure to register your account with an institutional email address (and not a private one), otherwise AWS cannot sponsor your account. To create an account, navigate to [aws.amazon.com](https://aws.amazon.com) and follow the instructions.

If you have previously created an AWS account, we strongly advise that you create a separate AWS account for managing the SAT/SMT-Comp tool construction and testing. This makes it much easier for us to manage account credits and billing. Once the new account is created, email us the account number at sat-comp@amazon.com (for SAT-Comp) or aws-smtcomp-2023@googlegroups.com (for SMT-Comp) and we will apply the appropriate credits to your account.

To find your account ID, click on your account name in the top right corner, and then click "My Account". You should see the Account ID in the Account Settings.

It is important that you tell us your account number immediately after creating the account, so that we can assign you a resource budget for your experiments. We also need to grant you access to the shared problem sets. Once we hear from you, we will email you an acknowledgment when resources have been added to your account.

## Building Your Solver

Next, it is time to develop your solver! All of the development and most of the testing can be performed on a local laptop, so it is not necessary to wait for AWS credits to get started. Please look at the instructions in the [Solver Development README](docker/README-Solver-Development.md) on how to start building and testing your solver.

## For Returning Competitors
We have worked hard to improve the developer experience. As compared with the 2022 infrastructure, we have added or modified:

1. Scripts and instructions for local debugging of both parallel and distributed solvers on your workstation/laptop.
1. Automated upload of stdout/stderr and intermediate files for each run to S3 storage so that you can debug runs after completion.
1. A "quickstart" that allows you to build the infrastructure and solver, then run it using only two commands.
1. Default values for most of the required arguments from 2022 to make it much simpler to run commands.
1. A richer `input.json` format used for solver communication that allows passing arguments to solvers, a timeout, and information about the input format.
1. A change in the expected solver output format in `solver_out.json`. This year, the satisfiability/unsatisfiability of the problem is indicated by the return code of the solver (10 for SAT, 20 for UNSAT).
1. Various improvements and bug fixes in the scripts for building/running jobs to provide better observability of the process.

Note that the changes to `input.json` and `solver_out.json` are breaking changes for solvers, described in more detail in [README-changes.md](README-changes.md). The changes should require only a short effort to update for 2023.

## Additional Resources: Analysis Problems

The SAT problems for the SAT-Comp 2022 competition are available [here](https://satcompetition.github.io/2022/downloads.html). The SMT problems for the SMT-Comp 2022 competition are available [here](https://smt-comp.github.io/2022/benchmarks.html).

## Additional Resources: Solvers

Here are the GitHub repositories for the solvers from the 2022 competitions. **Please Note:** the
infrastructure for 2023 is slightly changed to facilitate better debugging and an easier build. In order to run these solvers on the current infrastructure, you must update their `input.json` and `solver_out.json` handling as described in [README-changes.md](README-changes.md).

SAT-Comp Parallel:
* [ParKissat-RS](https://github.com/mww-aws/ParKissat/tree/RS)
* [ParKissat-PRE](https://github.com/mww-aws/ParKissat/tree/PRE)
* [PaKis22](https://github.com/KTRDeveloper/PaKis22)
* [PaKisMAB22](https://github.com/KTRDeveloper/PaKisMAB22)
* [DPS-Kissat](https://github.com/nabesima/DPS-satcomp2022)
* [NPS-Kissat](https://github.com/nabesima/DPS-satcomp2022/tree/non-det)
* [P-Kissat](https://github.com/vvallade/painless-sat-competition-2022/tree/pkissat)
* [P-MCOMSPS](https://github.com/vvallade/painless-sat-competition-2022)
* [Mallob-ki](https://github.com/domschrei/isc22-mallob/tree/ki)
* [gimsatul](https://github.com/arminbiere/gimsatul)

SAT-Comp Cloud:
* [Paracooba](https://github.com/maximaximal/paracooba-satcomp22)
* [Mallob-kicaliglu](https://github.com/domschrei/isc22-mallob/tree/kicaliglu)

SMT-Comp Parallel:
* [SMTS Cube and Conquer](https://github.com/usi-verification-and-security/aws-smts/tree/parallel-cube-and-conquer-fixed)
* [SMTS Portfolio](https://github.com/usi-verification-and-security/aws-smts/tree/parallel-portfolio)
* [Vampire](https://github.com/vprover/vampire/tree/smtcomp22)

SMT-Comp Cloud:
* [cvc5-cloud](https://github.com/amaleewilson/aws-satcomp-solver-sample/tree/cvc5)
* [SMTS Cube and Conquer](https://github.com/usi-verification-and-security/aws-smts/tree/cloud-cube-and-conquer-fixed)
* [SMTS Portfolio](https://github.com/usi-verification-and-security/aws-smts/tree/cloud-portfolio)
* [Vampire](https://github.com/vprover/vampire/tree/smtcomp22)

## FAQ

#### I already created my AWS account with a non-institutional email address. Can I still change the email address tied to my account?

Yes. To change your email address, follow the instructions at https://repost.aws/knowledge-center/change-email-address.
4
aws-batch-comp-infrastructure-sample/RELEASE-NOTES.txt
Normal file
@ -0,0 +1,4 @@
Release Notes:
==============
03/10/2022: Version 2. Updated to use new base containers and run in ECS.
02/25/2020: Version 1. Sample code documenting how to run SAT solvers in AWS Batch.
@ -0,0 +1,328 @@
# Solver Development for SAT/SMT-Comp

This README covers the process of building your solver and embedding it in docker images. We first show the steps to build and run an example solver, Mallob, then describe how to build your own solver within Docker and test it on your local machine.

## Prerequisites

Platforms Amazon Linux 2 (AL2), Ubuntu 20, and Mac OS Monterey (v12.6) have been tested successfully. Builds on other platforms may work, but have not been tested.

To build and run solvers, you will need the following tools installed:

- [python3](https://www.python.org/). To install the latest version for your platform, go to the [downloads](https://www.python.org/downloads/) page.
- [docker](https://www.docker.com/). There is a button to download Docker Desktop on the main page.

Basic familiarity with Docker will be helpful, but we will walk you through it step by step. If you need more information, there are a number of excellent tutorials, such as [this](https://docs.docker.com/get-started/).


# Example: Mallob

This example covers two configurations: parallel (single node, multiple cores) and distributed (multiple nodes, multiple cores per node). Because building a distributed solver is a superset of building a parallel solver, we will note steps that are only required for a distributed solver.

We use [Mallob](https://github.com/domschrei/mallob) as the running example for both parallel and distributed solvers. This example pulls a version of Mallob that we have tested with the AWS infrastructure. Note that although this repository is released under the MIT-0 license, the Dockerfiles use the Mallob project. Mallob and the solvers it uses have other licenses, including the [LGPL 3.0](https://opensource.org/license/lgpl-3-0/) license.

This section proceeds in three steps:
- Building base SATcomp Docker images
- Building Mallob Docker images
- Running Mallob in Docker

## Building the SATComp Base Images

To simplify the solver construction process, we provide base Docker images that manage the infrastructure necessary for solvers as well as access to AWS resources. We will build three images: a common image, a leader, and a worker. One leader coordinates multiple workers. The Dockerfiles and needed resources are contained in the [satcomp-images](satcomp-images) directory.

To begin, navigate to the `satcomp-images` directory and execute the `build_satcomp_images.sh` script. This script will build three images, `satcomp-infrastructure:common`, `satcomp-infrastructure:leader`, and `satcomp-infrastructure:worker`, which correspond to the common, leader, and worker images.

### Checking Docker Build Results

After building the docker images, check to make sure the images have built successfully:

1. Run `docker image ls` or `docker images`
2. Make sure that you see `satcomp-infrastructure:common`, `satcomp-infrastructure:leader`, and `satcomp-infrastructure:worker` in the list of images.

You should get a response similar to:

    % docker image ls
    REPOSITORY                TAG      IMAGE ID       CREATED          SIZE
    satcomp-infrastructure    worker   83e1a25f57a9   5 minutes ago    819MB
    satcomp-infrastructure    leader   766b446bd057   5 minutes ago    817MB
    satcomp-infrastructure    common   51da12f359f8   6 minutes ago    725MB
    ubuntu                    20.04    e40cf56b4be3   7 days ago       72.8MB
    %

## Building Mallob Images

To build the mallob distributed solver images, we will use the satcomp infrastructure worker and leader images built previously. To begin, cd into the `mallob-images` directory, which contains the needed Dockerfiles and other infrastructure.

To build all three docker images in one step, execute the `build_mallob_images.sh` script. You will want to create a similar script for your solver. More information on the script and how to build each docker image individually is available in the Q&A section of the document.

After building the mallob images, run `docker image ls` and make sure you see both `satcomp-mallob:leader` and `satcomp-mallob:worker` in the list of images.

This is what `docker image ls` shows for the tutorial to this point:

    % docker image ls
    REPOSITORY                TAG      IMAGE ID       CREATED          SIZE
    satcomp-mallob            worker   f3a2276355eb   2 minutes ago    916MB
    satcomp-mallob            leader   c3045f85f3bc   2 minutes ago    914MB
    satcomp-mallob            common   4bcf7c5a156e   2 minutes ago    1.58GB
    satcomp-infrastructure    worker   dad677687909   11 minutes ago   819MB
    satcomp-infrastructure    leader   c439dfb34537   12 minutes ago   817MB
    satcomp-infrastructure    common   3b2935f84a7c   12 minutes ago   725MB
    ubuntu                    20.04    e40cf56b4be3   2 weeks ago      72.8MB

## Running Mallob

We will use `docker run` to create docker _containers_ (processes) as instances of the docker _images_ we created in the previous section.

To find out which docker containers are running, use the command `docker ps`. The command `docker ps -a` will include containers that have already exited. To remove a container, run `docker rm <CONTAINER_ID>`. To remove all containers, run `docker rm $(docker ps -aq)`.

### Creating a Docker Network

Before running mallob we need to create a docker bridge network that our containers can attach to. This is necessary for both parallel and distributed mallob. Create a network named `mallob-test` by running the command `docker network create mallob-test`.

### Running Parallel Mallob

To run parallel Mallob, navigate to the `runner` directory. We have created a simple shell script called `run_parallel.sh`, which you will use to run the mallob_parallel docker image, starting a container and running a SAT/SMT problem in the container. The script has two variables that can be configured if you wish (described in the Q&A) but are set to sensible defaults.

**N.B.:** Because the docker image runs as a different user and group than the local host, you need to set the directory permissions so that the Docker image can read and write to the directory. Please run `sudo chgrp -R 1000 . && chmod 775 .` from the `docker/runner` directory so that the container can access this portion of the filesystem.

The `run_parallel.sh` script requires two command-line arguments:
- `<docker_image_name>`, which is `satcomp-mallob` for this example.
- `<query_file>`, which is the name of the test file for the solver. If you use the defaults, you should put SAT/SMT files in the `docker/runner/experiment` subdirectory, and if you run the script from the `runner` directory, then you can use standard shell completion for paths.

To run the script with the `test.cnf` file we provided, call `run_parallel.sh satcomp-mallob experiment/test.cnf` from within the `runner` directory. After creating a container, the script will drop you into a bash shell for this container.

The script will create an `input.json` file in the host run directory. This file will be copied to the docker run directory `/rundir`, where it will be read by the solver script.

The script comments explain the various arguments to the `docker run` command. The `docker run` invocation uses bash as the docker entrypoint and passes `init_solver.sh` as a command to bash. The initialization script starts sshd and then runs `/competition/solver /rundir`, which will execute the solver on the query in `<query_file>`. At this point, you should see mallob STDOUT and STDERR on the terminal as mallob runs the solver query.

### Running Distributed Mallob

Running distributed mallob requires two docker invocations running in two different terminal windows: one to start a leader container and one to start a worker container.

To run distributed Mallob, again cd into the `runner` directory. You will use two shell scripts, `run_dist_worker.sh` and `run_dist_leader.sh`.

- Step 1. Invoke `run_dist_worker.sh`, which requires a single command-line argument `<docker_image_name>`, which is `satcomp-mallob` for this example. Notice that the script will launch a `satcomp-mallob:worker` container. You will be dropped into a bash shell for the worker container. No further commands are needed.
- Step 2. From a different terminal on the host machine (in the same `runner` directory), invoke `run_dist_leader.sh`. This script requires the same two command-line arguments as `run_parallel.sh` in the previous section. For example, you can call `run_dist_leader.sh satcomp-mallob experiment/test.cnf`.

The `run_dist_leader` script will again create an `input.json` file, with more fields than used for parallel mallob. Again, you should see mallob output on the terminal.

### Debugging

After the solver query runs, you will be left at a bash shell prompt that is executing in the docker leader container. At that point, you can explore. You will see the `/competition` directory with the solver scripts. You will also see the `/rundir` directory that includes your test file, the solver input file `input.json`, and three output files.

If mallob ran successfully, you will find three new files in `/rundir`:

- `solver_out.json`
- `stdout.log`
- `stderr.log`

Again, note that the container directory `/rundir` is the mount point for the host run directory as specified by the script variable `HOST_RUNDIR`. The files in `/rundir` will persist on the host filesystem, even after the docker container is stopped or removed.

At this point, you can perform additional experiments or exit the docker shell.

The competition infrastructure starts solver containers and keeps them running for multiple queries. Each query will have a new `input.json` file, and `/competition/solver` will be run again.

**N.B.:** When debugging your own solver, the key step (other than making sure your solver ran correctly) is to ensure that you clean up resources between runs of the solver. You should ensure that no solver processes are running and any temporary files are removed between executions. During the competition, the docker images will be left running throughout, and each SAT/SMT problem will be injected into the running container. You are responsible for cleaning up files and processes created by your solver. In the case of Mallob, it performs the cleanup of temporary files for the leader when the solver starts (rather than when it finishes), so that you can inspect them after the solver completes execution.

To check for orphaned jobs, use `ps -ax` in both the leader and worker containers. This should show you all running processes. Make sure there aren't any stray processes that continue execution. In addition, check all the locations in the container where your solver places temporary files to make sure that they are removed after the run.

If your solver doesn't run correctly in the docker container, you can remove the `/competition/solver` commands from the `init_solver.sh` files. Once you are dropped into the docker container's bash shell, you can explore and debug directly, including running `/competition/solver /rundir` from the container shell command line.

We have now run, debugged, and inspected a sample solver. It is a good idea to try out multiple files, inspect the results, and try running in both cloud and parallel configurations. Once you are comfortable with these interactions, it is time to start working on your own solver.

# Preparing Your Own Solver Images

In the previous section, we walked through building and running a sample solver in both cloud and parallel configurations. We assume at this point that you know how to build with Docker and how to run it locally using the scripts that we have provided. We recommend that you run through the Mallob example before you start building your own images, but of course you can start here and then follow the same steps as described in "Running Mallob" using your own images.

In this section, we'll talk about how to build your own solvers into Docker containers that are derived from the base containers that we provide. All of the interaction with AWS services for deployment on the cloud is managed by the Docker base images. We provide two base images: one for leader nodes and one for worker nodes (N.B.: workers are only required for the cloud track). The Leader Node is responsible for collecting IP addresses of available worker nodes before starting the solving process, pulling work from an SQS queue, downloading problem instances from S3, and sharing and coordinating with Worker Nodes. The Worker Node base container is responsible for reporting its status and IP address to the Leader Node. As these base images manage the interaction with AWS, you can build and test your solvers locally, and they should work the same way when deployed on the cloud.

## Building from Competition Base Containers

Your solver must be buildable from source code using a standard process. We use Docker to create a build process that can easily be standardized. Docker removes many of the platform-specific problems with building under one operating system and running in another.

You will provide a GitHub repo with a Dockerfile in the top directory that we will use to build your solver. The first section of this README constructed just such a solver. In this guide, we start by building and running the solver locally; once it is working properly, we will use the [Infrastructure README](../infrastructure/README-Infrastructure.md) to test it using AWS resources. We will use the same process to host your solver during the competition.

For the cloud track, you will need to supply two Dockerfiles for your solver, one that acts as a Leader Node and one that acts as a Worker Node. In the case of a parallel solver, you only need to provide a single image, which we will also call a Leader Node for convenience.

The Docker base images are contained in the `satcomp-images` directory. As you follow this example, you should also consult the `mallob-images` directory to see how we based the Mallob examples on the images in `satcomp-images`.

You should begin your Dockerfiles with:

```text
FROM satcomp-infrastructure:leader
```
or

```text
FROM satcomp-infrastructure:worker
```

The base images (which are based on ubuntu:20.04) have predefined entrypoints that will start when the container is invoked. These entrypoints handle all interaction with the competition infrastructure. As a result, you are only responsible for invoking your solver; we expect that you will not have to write code for most of the interactions with AWS resources.

### Leader Base Container

The Leader Base Container sets up the SAT/SMT problem for the solver. It loads the SAT/SMT problem, then attempts to run the executable file at `/competition/solver` in the docker image. It provides a single argument to this file, which is a directory path containing a file called `input.json`.

Here is an example of the `input.json` file for cloud mallob:

```text
{
  "formula_file": "/mount/efs/problem.cnf",
  "formula_language": "DIMACS",
  "solver_argument_list": ["-p", "another_arg"],
  "timeout_seconds": 10,
  "worker_node_ips": ["192.158.1.38", "192.158.2.39", ...]
}
```

where:
* `formula_file` is the path to the SAT/SMT problem to be solved.
* `formula_language` is the encoding of the problem (currently we use DIMACS for SAT-Comp and SMTLIB2 for SMT-Comp). This field is optional and can be ignored by the solver.
* `solver_argument_list` allows passthrough of arguments to the solver. This allows you to try different strategies without rebuilding your docker container by varying the arguments.
* `timeout_seconds` is the timeout for the solver. It will be enforced by the infrastructure; a solver that doesn't complete within the timeout will be terminated.
* `worker_node_ips` is the list of worker nodes for cloud solvers. For parallel solvers this field will always be the empty list.

For the interactive example in the first section, this file is generated by the [`run_parallel.sh`](runner/run_parallel.sh) script.

The solver is then expected to run and generate a file called `solver_out.json`.
Here is an example of `solver_out.json` from distributed mallob:

```text
{
  "return_code": int, // The return code for the solver.
  "artifacts": {
    "stdout_path": String, // Where to find the stdout of the solve run
    "stderr_path": String // Where to find the stderr of the solve run
  }
}
```

where:
* `return_code` is the return code for the solver. **N.B.**: The return_code in `solver_out.json` will determine the satisfiability of the problem for the solver: A return code of 10 indicates SAT, 20 indicates UNSAT, 0 indicates UNKNOWN, and all other return codes indicate an error.
* `stdout_path` is the path to the stdout log.
* `stderr_path` is the path to the stderr log.

You will adapt the Mallob `solver` script to invoke your solver and coordinate with Worker nodes. You should extend the competition base image by copying your `solver` script into a new container that is based on the Competition Base Container.

For example:

```text
FROM satcomp-base:leader

# Do all steps necessary to build your solver
RUN apt-get install my-amazing-solver

# Add your solver script
COPY solver /competition/solver
RUN chmod +x /competition/solver
```

Consult the Mallob [leader Dockerfile](mallob-images/leader/Dockerfile) for a more complete example. Note that you should not provide a docker entrypoint or a CMD because the entrypoint is specified by the base container.

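Before adapting the Mallob script, it may help to see the overall shape of a leader `solver` entry point. The following is an illustrative sketch only; the binary `/competition/my_solver` is a placeholder, and the sketch assumes your solver already exits with the competition codes. The Mallob script linked above is the authoritative example:

```python
#!/usr/bin/env python3
# Illustrative sketch of a /competition/solver entry point for a parallel
# (leader-only) solver. Binary name and exit-code assumptions are placeholders.
import json
import os
import subprocess
import sys

request_dir = sys.argv[1]                                  # e.g. /rundir
with open(os.path.join(request_dir, "input.json")) as f:
    cfg = json.load(f)

stdout_path = os.path.join(request_dir, "stdout.log")
stderr_path = os.path.join(request_dir, "stderr.log")

cmd = ["/competition/my_solver"] + cfg.get("solver_argument_list", []) + [cfg["formula_file"]]
with open(stdout_path, "w") as out, open(stderr_path, "w") as err:
    rc = subprocess.run(cmd, stdout=out, stderr=err).returncode

# Map the solver's exit status to the competition codes (10/20/0, anything else is an error).
return_code = rc if rc in (10, 20, 0) else -1

with open(os.path.join(request_dir, "solver_out.json"), "w") as f:
    json.dump({"return_code": return_code,
               "artifacts": {"stdout_path": stdout_path, "stderr_path": stderr_path}}, f)
```
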
### Worker Base Container

Workers are expected to be controllable by the leader node using `ssh`. For workers, competitors have three responsibilities:
1. To report their status to the AWS infrastructure once per second, so that the infrastructure can determine whether work can be allocated to them.
2. To clean up when the leader completes a problem (or terminates with an error).
3. To assist with the solving process using ssh.

#### Reporting Status

To accomplish the first task, the infrastructure assumes that there is an executable at the location `/competition/worker` that will report status once per second. Status reporting is done by updating a file at location `/competition/worker_node_status.json`. The file contains a current status (one of {READY, BUSY, ERROR}) and a timestamp. The timestamp is the unix epoch time as returned by the linux time() function. If the worker fails to update this status file for more than 5 seconds, the worker node will be considered failed and the infrastructure will reset the node. Repeated failures of worker nodes (likely to be a maximum of three) will cause the system to score the problem as an 'error'.

Here is an example of the format for `worker_node_status.json`:

```text
{
  "status": "READY", // one of {"READY", "BUSY", "ERROR"}
  "timestamp": "1644545117" // linux epoch time as returned by the C time() function
}
```

The Mallob worker script is a 'bare-bones' example of status reporting that simply checks the status of the worker process and sshd. See the [worker](mallob-images/worker/worker) for details.

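For orientation, here is a minimal, illustrative status-reporter loop (a simplified variant of the Mallob worker script included later in this change). The readiness check is a placeholder you would replace with checks appropriate to your own solver:

```python
#!/usr/bin/env python3
# Illustrative sketch: write /competition/worker_node_status.json once per second.
# The readiness check below is a placeholder, not a real health check.
import json
import time

def current_status() -> str:
    # Placeholder: report READY unconditionally. A real worker should return
    # one of "READY", "BUSY", or "ERROR" based on its own processes (e.g. sshd, MPI).
    return "READY"

while True:
    data = {"status": current_status(), "timestamp": int(time.time())}
    with open("/competition/worker_node_status.json", "w") as f:
        f.write(json.dumps(data))
    time.sleep(1)  # must be refreshed at least every 5 seconds or the node is reset
```
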
#### Cleaning up

You are also responsible for writing a worker `cleanup` script that is executed when a solving task is complete. The infrastructure will ensure that this file is called after the leader completes a SAT/SMT problem, or after the leader fails in some way. We add this capability to make it straightforward to kill solver processes that are still working on subproblems when the leader node completes. Given that the leader can also communicate with the worker using SSH, this file is not strictly necessary depending on how the system is architected (i.e., if the leader can gracefully and quickly terminate all workers at the end of solving, and the leader itself does not fail).

In normal operation, this script is executed by the infrastructure when the leader process completes. It will also be executed when a failure occurs. The script must reset the worker state (the node will remain) so that it is ready to solve the next problem; a sketch of such a script follows the Dockerfile example below.

For example:

```text
FROM satcomp-base:worker

# Do all steps necessary to build your solver
RUN apt-get install my-amazing-solver

# Add your solver script
COPY worker /competition/worker
RUN chmod +x /competition/worker

COPY cleanup /competition/cleanup
RUN chmod +x /competition/cleanup
```

See the Mallob worker [Dockerfile](satcomp-images/satcomp-worker/Dockerfile) for more details.

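The repository does not prescribe the contents of `cleanup`; as an illustration only, it might look like the following sketch, where the process name `my_solver_worker` and the temporary directory `/tmp/my_solver` are placeholders:

```python
#!/usr/bin/env python3
# Illustrative sketch of a /competition/cleanup script: kill leftover solver
# processes and remove temporary files so the worker is ready for the next run.
# "my_solver_worker" and /tmp/my_solver are placeholders for your own names.
import shutil
import subprocess

# Terminate any solver processes still running from the previous job.
subprocess.run(["pkill", "-f", "my_solver_worker"], check=False)

# Remove temporary files left behind by the previous job.
shutil.rmtree("/tmp/my_solver", ignore_errors=True)
```
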
#### Assisting with the Solving Process

There are many ways to accomplish this goal, and to some degree this is the critical part of building a distributed solver. In past years, many competitors have used [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) to structure the communications between the leader and worker nodes (Mallob is one example that uses MPI), but competitors have the freedom to structure communication in whatever way makes the most sense for their solver. Worker nodes are not passed a representation of the SAT/SMT problem by the infrastructure: instead, the leader must communicate the problem (or a portion of it) to the workers.

## Next step: Building / Running on AWS Infrastructure
After building your solvers, please test them using Docker, as we did with Mallob in this README. Once you have built your solver and you are confident that it works properly using Docker, you are ready to try it out at larger scale on AWS infrastructure. Next, please navigate to the [Infrastructure README](../infrastructure/README-Infrastructure.md), where the process of uploading, running, and debugging your docker images on AWS is described.


## FAQ / Troubleshooting

### Q: What is the build_mallob_images.sh script doing? What do the flags for docker build mean?

The script looks like this:

    #!/bin/sh
    cd common
    docker build -t satcomp-mallob:common .
    cd ..

    cd leader
    docker build -t satcomp-mallob:leader .
    cd ..

    cd worker
    docker build -t satcomp-mallob:worker .
    cd ..

First we factor out the common parts of the leader and worker into a common image. This Dockerfile builds the Mallob solver and its components that will be used by both the leader and the worker image. Then we use this common image to build the leader and worker images.

Two characteristics of `docker build` are important for this script. First, the `-t satcomp-mallob:common` argument tells Docker to give the newly constructed image a _tag_ that provides a name for the docker image. Second, all file paths in Docker are local to the directory where the `docker build` command occurs. This is why we need to navigate to the directory containing the Dockerfile prior to running the build.

Note that there is also a `nocache_build_mallob_images.sh` script. Although Docker does a pretty good job of dependency tracking, certain changes are invisible to it. For example, Docker can determine whether there has been an update to the Mallob repository, but if Mallob calls out to another repository, then Docker will not detect it, and you might get a stale build. To guard against this, there is a `--no-cache` flag that will do a full rebuild. If your solver has similar indirect dependencies, you might need to use the `--no-cache` flag.

#### To build the Mallob leader image:

1. Navigate to the `leader` subdirectory.
2. Run `docker build -t satcomp-mallob:leader .` The resulting image will be named `satcomp-mallob:leader`. This Dockerfile adds the scripts necessary to use Mallob as the leader node in a distributed solver to the generated image.

#### To build the Mallob worker image:

1. Navigate to the `worker` subdirectory.
2. Run `docker build -t satcomp-mallob:worker .` This Dockerfile adds the scripts necessary to use Mallob as the worker node in a distributed solver to the generated image.


### Q: Suppose I want to change the directory or network name for the run_parallel and run_cloud scripts. How do I do this?

A: The two variables to change are:
- `DOCKER_NETWORK`. Name of the docker bridge network. The default is `mallob-test`, which will work with the `network create` command in the previous section.
- `HOST_RUNDIR`. Name of the host directory that will be mounted in the docker run directory. _Note: This should be an absolute pathname and it must be updated for your filesystem._ This directory will be mounted in the docker container and enables you to persist information over multiple docker container sessions. We have placed an `experiment` directory inside `runner` that you can use to begin.

These are both documented in the script files.


### Q: After a while, I have lots of old versions of Docker images that are taking up disk space that I no longer need. How do I get rid of them?

A: On repeated image builds, previously-built images with the same name will be left with the name/tag `<none>/<none>`. These dangling images can be deleted with `docker image prune`. A specific image can be deleted by running `docker rmi <IMAGE ID>`.

Note that you can delete all docker images on your machine by running `docker rmi -f $(docker images -a -q)`. Be careful: only do this after running `docker images` to check that you won't delete images unrelated to the solver competition.

### Q: I'm only submitting to the parallel track and not the cloud track. Do I need a worker image?

A: No. If you are submitting to the parallel track only, you do not need a worker image. For the parallel track, we will assume that the leader manages all threading and communications within the single (multi-core) compute node.
@ -0,0 +1,12 @@
#!/bin/sh
cd common
docker build -t satcomp-mallob:common .
cd ..

cd leader
docker build -t satcomp-mallob:leader .
cd ..

cd worker
docker build -t satcomp-mallob:worker .
cd ..
@ -0,0 +1,13 @@
################### Build Mallob
FROM satcomp-infrastructure:common
USER root
# Install required software
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt install -y cmake build-essential zlib1g-dev libopenmpi-dev wget unzip build-essential zlib1g-dev cmake python3 build-essential gfortran wget curl

# Build Mallob
RUN git clone https://github.com/domschrei/mallob && cd mallob && git checkout interface && git checkout 35794bfa9d650f7c465c08cd0073a71833119162 && cd ..
WORKDIR mallob
RUN cd lib && bash fetch_and_build_sat_solvers.sh klcy && cd ..
RUN mkdir build
RUN cd build && cmake -DCMAKE_BUILD_TYPE=RELEASE -DMALLOB_USE_RESTRICTED=1 .. && VERBOSE=1 make && cd ..
@ -0,0 +1,20 @@
################### Use Mallob
FROM satcomp-mallob:common AS builder
USER root


################### Extract Mallob in run stage
FROM satcomp-infrastructure:leader AS mallob_liaison
WORKDIR /
# Copy mallob and solver scripts
COPY --from=builder /mallob/build/mallob mallob
COPY --from=builder /mallob/build/mallob_sat_process mallob_sat_process
COPY --from=builder /mallob/build/mallob_process_dispatcher mallob_process_dispatcher

COPY --chown=ecs-user /init_solver.sh /competition
COPY --chown=ecs-user /run_solver.sh /competition
COPY --chown=ecs-user /solver /competition
USER ecs-user
RUN chmod +x /competition/init_solver.sh
RUN chmod +x /competition/run_solver.sh
RUN chmod +x /competition/solver
@ -0,0 +1,7 @@
#!/bin/bash

# start sshd
/usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config &

# run solver
/competition/solver /rundir
@ -0,0 +1,10 @@
#!/bin/bash

export OMPI_MCA_btl_vader_single_copy_mechanism=none
export RDMAV_FORK_SAFE=1
mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --hostfile $1 --bind-to none mallob -mono=$2 \
    -zero-only-logging -sleep=1000 -t=4 -appmode=fork -nolog -v=3 -interface-fs=0 -trace-dir=competition \
    -pipe-large-solutions=0 -processes-per-host=$processes_per_host -regular-process-allocation \
    -max-lits-per-thread=50000000 -strict-clause-length-limit=20 -clause-filter-clear-interval=500 \
    -max-lbd-partition-size=2 -export-chunks=20 -clause-buffer-discount=1.0 -satsolver=k
115
aws-batch-comp-infrastructure-sample/docker/mallob-images/leader/solver
Executable file
@ -0,0 +1,115 @@
#!/usr/bin/env python3
import json
import logging
import os
import subprocess
import sys
import threading

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


class Runner:
    def __init__(self, request_directory: str):
        self.logger = logging.getLogger("Runner")
        self.logger.setLevel(logging.INFO)
        self.request_directory = request_directory
        os.environ['PYTHONUNBUFFERED'] = "1"

    def process_stream(self, stream, str_name, file_handle):
        line = stream.readline()
        while line != "":
            self.logger.info(f"{str_name}: {line}")
            file_handle.write(line)
            line = stream.readline()

    def run(self, cmd: list):
        self.logger.info("Running command: %s", str(cmd))

        stdout_target_loc = os.path.join(self.request_directory, "stdout.log")
        stderr_target_loc = os.path.join(self.request_directory, "stderr.log")

        with open(stdout_target_loc, "w") as stdout_handle:
            with open(stderr_target_loc, "w") as stderr_handle:
                proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                        universal_newlines=True, start_new_session=True)
                stdout_t = threading.Thread(target=self.process_stream, args=(proc.stdout, "STDOUT", stdout_handle))
                stderr_t = threading.Thread(target=self.process_stream, args=(proc.stderr, "STDERR", stderr_handle))
                stdout_t.start()
                stderr_t.start()
                return_code = proc.wait()
                stdout_t.join()
                stderr_t.join()

        return {
            "stdout": stdout_target_loc,
            "stderr": stderr_target_loc,
            "return_code": return_code,
            "output_directory": self.request_directory
        }

    def get_input_json(self):
        input = os.path.join(self.request_directory, "input.json")
        with open(input) as f:
            return json.loads(f.read())

    def create_hostfile(self, ips, request_dir):
        hostfile_path = os.path.join(request_dir, 'combined_hostfile')
        with open(hostfile_path, 'w+') as f:
            for ip in ips:
                f.write(f'{ip} slots=4')    # distributed default
                # f.write(f'{ip} slots=16') # parallel default
                f.write('\n')
        return hostfile_path

    def get_command(self, input_json):
        formula_file = input_json.get("formula_file")
        worker_node_ips = input_json.get("worker_node_ips", [])

        combined_hostfile = self.create_hostfile(worker_node_ips, self.request_directory)

        run_list = ["/competition/run_solver.sh"]
        run_list.append(combined_hostfile)
        run_list.append(formula_file)

        return run_list


class MallobParser:
    @staticmethod
    def get_result(output_file):
        """
        TODO: Participants should replace this with something more robust for their own solver!
        """
        with open(output_file) as f:
            raw_logs = f.read()
            if "found result UNSAT" in raw_logs:
                return 20
            elif "found result SAT" in raw_logs:
                return 10
            elif "UNKNOWN" in raw_logs:
                return 0
            else:
                return -1


if __name__ == "__main__":
    request_directory = sys.argv[1]
    runner = Runner(request_directory)

    input_json = runner.get_input_json()
    cmd = runner.get_command(input_json)

    print(f"cmd: {cmd}")
    output = runner.run(cmd)
    result = MallobParser.get_result(output["stdout"])
    logging.info(f"RESULT: {result}")
    solver_output = {
        "return_code": result,
        "artifacts": {
            "stdout_path": output["stdout"],
            "stderr_path": output["stderr"]
        }
    }

    with open(os.path.join(request_directory, "solver_out.json"), "w+") as f:
        f.write(json.dumps(solver_output))
@ -0,0 +1,12 @@
#!/bin/sh
cd common
docker build --no-cache -t satcomp-mallob:common .
cd ..

cd leader
docker build --no-cache -t satcomp-mallob:leader .
cd ..

cd worker
docker build --no-cache -t satcomp-mallob:worker .
cd ..
@ -0,0 +1,17 @@
################### Use Mallob
FROM satcomp-mallob:common AS builder
USER root

################### Extract Mallob in run stage
FROM satcomp-infrastructure:worker AS mallob_liaison
WORKDIR /
# Copy mallob
COPY --from=builder /mallob/build/mallob mallob
COPY --from=builder /mallob/build/mallob_sat_process mallob_sat_process
COPY --from=builder /mallob/build/mallob_process_dispatcher mallob_process_dispatcher

COPY --chown=ecs-user /init_solver.sh /competition
COPY --chown=ecs-user /worker /competition
USER ecs-user
RUN chmod +x /competition/init_solver.sh
RUN chmod +x /competition/worker
@ -0,0 +1,5 @@
#!/bin/bash

# start sshd
/usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config &
@ -0,0 +1,37 @@
#!/usr/bin/env python3

import json
import subprocess
import time
import logging

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


def check_process_status():
    status = 'READY'
    output = subprocess.check_output(['ps', 'ax', '-ocomm']).decode("utf-8").strip()

    if 'orted' in output:
        status = 'BUSY'

    if 'sshd' not in output:
        status = 'ERROR'

    return status


def update_worker_status():
    while True:
        status = check_process_status()

        data = {"status": status, "timestamp": int(time.time())}
        with open(
            "/competition/worker_node_status.json",
            "w+",
        ) as statusfile:
            statusfile.write(json.dumps(data))
        time.sleep(1)


if __name__ == "__main__":
    update_worker_status()
@ -0,0 +1,2 @@
leader slots=4
worker slots=4
4
aws-batch-comp-infrastructure-sample/docker/runner/experiment/test.cnf
Executable file
@ -0,0 +1,4 @@
c
p cnf 3 2
1 -3 0
2 3 -1 0
@ -0,0 +1,48 @@
#!/bin/bash

# run_dist_leader.sh: Use this script to launch the Docker container
# for the distributed Mallob leader. Arguments:
#   $1: docker image name (assumes a "leader" tag)
#   $2: formula file (relative to HOST_RUNDIR)

# check number of arguments
if [[ $# -lt 2 ]]; then
    echo "run_dist_leader.sh: needs two arguments: <docker_image_name> <formula file>"
    exit 1
fi

# user config
DOCKER_NETWORK="mallob-test"
HOST_RUNDIR="/home/rbtjones/dev/sat23/docker/runner/experiment"

# script config
NODE_TYPE="leader"
DOCKER_RUNDIR="/rundir"

# summary
echo "run_dist_leader.sh, running with"
echo "    node type:      $NODE_TYPE"
echo "    docker network: $DOCKER_NETWORK"
echo "    docker image:   $1:$NODE_TYPE"
echo "    formula file:   $HOST_RUNDIR/$2"
echo "    host rundir:    $HOST_RUNDIR"
echo "    docker rundir:  $DOCKER_RUNDIR"

# does host directory exist?
if [[ ! -d "$HOST_RUNDIR" ]]; then
    echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
    exit 1
fi

# does formula file exist?
if [[ ! -f "$HOST_RUNDIR/$2" ]]; then
    echo "ERROR - unable to read formula file '$HOST_RUNDIR/$2'"
    exit 1
fi

# create input.json
echo -e "{\n \"formula_file\": \"$DOCKER_RUNDIR/$2\",\n \"worker_node_ips\": [\"leader\", \"worker\"],\n \"timeout_seconds\": \"1000\",\n \"formula_language\": \"smt2\",\n \"solver_argument_list\": []\n}" > "$HOST_RUNDIR/input.json"

# Run docker image. See comments in run_parallel.sh
#
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:/$DOCKER_RUNDIR -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
# check number arguments
|
||||
if [[ $# -lt 1 ]]; then
|
||||
echo "configure_parallel.sh: needs one argument: <docker_image_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# user config
|
||||
DOCKER_NETWORK="mallob-test"
|
||||
HOST_RUNDIR="/home/rbtjones/dev/sat23/docker/runner/experiment"
|
||||
|
||||
# config to match other scripts
|
||||
NODE_TYPE="worker"
|
||||
DOCKER_RUNDIR="/rundir"
|
||||
|
||||
# summary
|
||||
echo "run_dist_worker.sh, running with"
|
||||
echo " node type: $NODE_TYPE"
|
||||
echo " docker network: $DOCKER_NETWORK"
|
||||
echo " docker image: $1:$NODE_TYPE"
|
||||
echo " host rundir: $HOST_RUNDIR"
|
||||
echo " docker rundir: $DOCKER_RUNDIR"
|
||||
|
||||
# does host directory exist?
|
||||
if [[ ! -d "$HOST_RUNDIR" ]]; then
|
||||
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run docker image. See comments in run_parallel.sh
|
||||
#
|
||||
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:/$DOCKER_RUNDIR -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"
|
@ -0,0 +1,60 @@
|
||||
#!/bin/bash
|
||||
|
||||
# run_parallel.sh: Use this script to launch Docker container
|
||||
# for parallel Mallob. Arguments:
|
||||
# $1: docker image name (assumes a "leader" tag)
|
||||
# $2: formula file (relative to HOST_RUNDIR)
|
||||
|
||||
# check number arguments
|
||||
if [[ $# -lt 2 ]]; then
|
||||
echo "run_parallel.sh: needs two arguments: <docker_image_name> <formula file>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# user config
|
||||
DOCKER_NETWORK="mallob-test"
|
||||
HOST_RUNDIR="/home/rbtjones/dev/sat23/docker/runner/experiment"
|
||||
|
||||
# config to match other scripts
|
||||
NODE_TYPE="leader"
|
||||
DOCKER_RUNDIR="/rundir"
|
||||
|
||||
# summary
|
||||
echo "run_parallel.sh, running with"
|
||||
echo " node type: $NODE_TYPE"
|
||||
echo " docker network: $DOCKER_NETWORK"
|
||||
echo " docker image: $1:$NODE_TYPE"
|
||||
echo " formula file: $HOST_RUNDIR/$2"
|
||||
echo " host rundir: $HOST_RUNDIR"
|
||||
echo " docker rundir: $DOCKER_RUNDIR"
|
||||
|
||||
# does host directory exist?
|
||||
if [[ ! -d "$HOST_RUNDIR" ]]; then
|
||||
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# does formula file exist?
|
||||
if [[ ! -f "$HOST_RUNDIR/$2" ]]; then
|
||||
echo "ERROR - unable to read formula file '$HOST_RUNDIR/$2'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
#
|
||||
# create input.json
|
||||
#
|
||||
echo -e "{\n\"formula_file\": \"$DOCKER_RUNDIR/$2\",\n\"worker_node_ips\": [\"leader\"]\n}" > "$HOST_RUNDIR/input.json"
|
||||
|
||||
#
|
||||
# Run docker image as container. Arguments:
|
||||
# -i // keep STDIN open even if not attached
|
||||
# --shm-size=XXg // setting up shared memory (for clause DB)
|
||||
# // default (1G?) is too small
|
||||
# --network mallob-test // use the Docker bridge network called 'mallob-test'
|
||||
# --entrypoint bash // when the container starts, it runs bash
|
||||
# --rm // remove the docker container when exiting
|
||||
# -v // mount $HOST_RUNDIR in the host filesystem at '/$DOCKER_RUNDIR' in docker image
|
||||
# -t // Docker image name for leader (script hard-codes 'leader' tag)
|
||||
# -c <bash_command> // gets passed to bash shell
|
||||
|
||||
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:/$DOCKER_RUNDIR -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"
|
7
aws-batch-comp-infrastructure-sample/docker/runner/input.json
Executable file
@ -0,0 +1,7 @@
|
||||
{
|
||||
"formula_file": "/rundir/experiment/test.cnf",
|
||||
"worker_node_ips": ["leader", "worker"],
|
||||
"timeout_seconds": "1000",
|
||||
"formula_language": "",
|
||||
"solver_argument_list": [""]
|
||||
}
|
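For illustration only (not part of the repository), a minimal Python sketch of how a solver entrypoint might read this input.json; the /rundir mount point matches DOCKER_RUNDIR in the runner scripts, while the field handling shown here is an assumption:

#!/usr/bin/env python3
# Illustrative sketch: parse the input.json generated by the runner scripts.
# The /rundir path mirrors DOCKER_RUNDIR; everything else is an assumption.
import json

with open("/rundir/input.json") as f:
    request = json.load(f)

formula_file = request["formula_file"]                      # e.g. /rundir/experiment/test.cnf
worker_ips = request.get("worker_node_ips", [])             # node names or IPs
timeout_seconds = int(request.get("timeout_seconds", "1000"))
solver_args = request.get("solver_argument_list", [])

print(f"solving {formula_file} on {len(worker_ips)} node(s), "
      f"timeout {timeout_seconds}s, extra args {solver_args}")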
63
aws-batch-comp-infrastructure-sample/docker/runner/run_dist_leader.sh
Executable file
@ -0,0 +1,63 @@
|
||||
#!/bin/zsh
|
||||
|
||||
# run_dist_leader.sh: Use this script to launch Docker container
|
||||
# for parallel Mallob. Arguments:
|
||||
# $1: docker image name (assumes a "leader" tag)
|
||||
# $2: formula file (relative to HOST_RUNDIR)
|
||||
# $*: solver arguments (passed directly to solver)
|
||||
|
||||
# check number arguments
|
||||
if [[ $# -lt 2 ]]; then
|
||||
echo "run_dist_leader.sh usage: <docker_image_name> <formula_file> [solver arguments]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# read first two command-line arguments
|
||||
DOCKER_IMAGE_NAME=$1
|
||||
FORMULA_FILENAME=$2
|
||||
shift 2
|
||||
SOLVER_ARGS_SPACED=$*
|
||||
SOLVER_ARGS="\"${SOLVER_ARGS_SPACED// /\", \"}\""
|
||||
|
||||
# default config; user can replace DOCKER_NETWORK / HOST_RUNDIR if desired.
|
||||
DOCKER_NETWORK="mallob-test"
|
||||
SCRIPTPATH=$(readlink -f "$0")
|
||||
SCRIPTDIR=$(dirname "$SCRIPTPATH")
|
||||
HOST_RUNDIR="$SCRIPTDIR"
|
||||
|
||||
# script config
|
||||
NODE_TYPE="leader"
|
||||
DOCKER_RUNDIR="/rundir"
|
||||
|
||||
DOCKER_IMAGE="$DOCKER_IMAGE_NAME:$NODE_TYPE"
|
||||
HOST_FORMULA_FILE="$HOST_RUNDIR/$FORMULA_FILENAME"
|
||||
DOCKER_FORMULA_FILE="$DOCKER_RUNDIR/$FORMULA_FILENAME"
|
||||
|
||||
# summary
|
||||
echo "run_dist_worker.sh, running with:"
|
||||
echo " node type: $NODE_TYPE"
|
||||
echo " docker network: $DOCKER_NETWORK"
|
||||
echo " docker image: $DOCKER_IMAGE"
|
||||
echo " formula file: $FORMULA_FILENAME"
|
||||
echo " host rundir: $HOST_RUNDIR"
|
||||
echo " docker rundir: $DOCKER_RUNDIR"
|
||||
|
||||
# does host directory exist?
|
||||
if [[ ! -d "$HOST_RUNDIR" ]]; then
|
||||
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# does formula file exist?
|
||||
if [[ ! -f "$HOST_FORMULA_FILE" ]]; then
|
||||
echo "ERROR - unable to read formula file '$HOST_FORMULA_FILE'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# create input.json, all extra command-line args get hoovered as solver args
|
||||
echo -e "{\n \"formula_file\": \"$DOCKER_FORMULA_FILE\",\n \"worker_node_ips\": [\"leader\", \"worker\"],\n \"timeout_seconds\": \"1000\",\n \"formula_language\": \"\",\n \"solver_argument_list\": [$SOLVER_ARGS]\n}" > "$HOST_RUNDIR/input.json"
|
||||
|
||||
|
||||
# Run docker image. See comments in run_parallel.sh
|
||||
#
|
||||
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:$DOCKER_RUNDIR -t $DOCKER_IMAGE -c "/competition/init_solver.sh; exec bash"
|
24
aws-batch-comp-infrastructure-sample/docker/runner/run_dist_worker.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/zsh
|
||||
|
||||
# check number arguments
|
||||
if [[ $# -lt 1 ]]; then
|
||||
echo "configure_parallel.sh: needs one argument: <docker_image_name>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# default config; user can replace DOCKER_NETWORK if desired.
|
||||
DOCKER_NETWORK="mallob-test"
|
||||
|
||||
|
||||
# config to match other scripts
|
||||
NODE_TYPE="worker"
|
||||
|
||||
# summary
|
||||
echo "run_dist_worker.sh, running with"
|
||||
echo " node type: $NODE_TYPE"
|
||||
echo " docker network: $DOCKER_NETWORK"
|
||||
echo " docker image: $1:$NODE_TYPE"
|
||||
|
||||
# Run docker image. See comments in run_parallel.sh
|
||||
#
|
||||
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"
|
74
aws-batch-comp-infrastructure-sample/docker/runner/run_parallel.sh
Executable file
@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
|
||||
# run_parallel.sh: Use this script to launch Docker container
|
||||
# for parallel Mallob. Arguments:
|
||||
# $1: docker image name (assumes a "leader" tag)
|
||||
# $2: formula file (relative to HOST_RUNDIR)
|
||||
# $*: solver arguments (passed directly to solver)
|
||||
|
||||
# check number arguments
|
||||
if [[ $# -lt 2 ]]; then
|
||||
echo "run_parallel.sh usage: <docker_image_name> <formula_file> [solver arguments]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# read first two command-line arguments
|
||||
DOCKER_IMAGE_NAME=$1
|
||||
FORMULA_FILENAME=$2
|
||||
shift 2
|
||||
SOLVER_ARGS_SPACED=$*
|
||||
SOLVER_ARGS="\"${SOLVER_ARGS_SPACED// /\", \"}\""
|
||||
|
||||
# default config; user can replace DOCKER_NETWORK / HOST_RUNDIR if desired.
|
||||
DOCKER_NETWORK="mallob-test"
|
||||
SCRIPTPATH=$(readlink -f "$0")
|
||||
SCRIPTDIR=$(dirname "$SCRIPTPATH")
|
||||
HOST_RUNDIR="$SCRIPTDIR"
|
||||
|
||||
# config to match other scripts
|
||||
NODE_TYPE="leader"
|
||||
DOCKER_RUNDIR="/rundir"
|
||||
|
||||
DOCKER_IMAGE="$DOCKER_IMAGE_NAME:$NODE_TYPE"
|
||||
HOST_FORMULA_FILE="$HOST_RUNDIR/$FORMULA_FILENAME"
|
||||
DOCKER_FORMULA_FILE="$DOCKER_RUNDIR/$FORMULA_FILENAME"
|
||||
|
||||
# summary
|
||||
echo "run_parallel.sh, running with"
|
||||
echo " node type: $NODE_TYPE"
|
||||
echo " docker network: $DOCKER_NETWORK"
|
||||
echo " docker image: $DOCKER_IMAGE"
|
||||
echo " formula file: $FORMULA_FILENAME"
|
||||
echo " host rundir: $HOST_RUNDIR"
|
||||
echo " docker rundir: $DOCKER_RUNDIR"
|
||||
|
||||
# does host directory exist?
|
||||
if [[ ! -d "$HOST_RUNDIR" ]]; then
|
||||
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# does formula file exist?
|
||||
if [[ ! -f "$HOST_FORMULA_FILE" ]]; then
|
||||
echo "ERROR - unable to read formula file '$HOST_FORMULA_FILE'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
#
|
||||
# create input.json
|
||||
#
|
||||
echo -e "{\n \"formula_file\": \"$DOCKER_FORMULA_FILE\",\n \"worker_node_ips\": [\"leader\"],\n \"timeout_seconds\": \"1000\",\n \"formula_language\": \"\",\n \"solver_argument_list\": [$SOLVER_ARGS]\n}" > "$HOST_RUNDIR/input.json"
|
||||
|
||||
#
|
||||
# Run docker image as container. Arguments:
|
||||
# -i // keep STDIN open even if not attached
|
||||
# --shm-size=XXg // setting up shared memory (for clause DB)
|
||||
# // default (1G?) is too small
|
||||
# --network mallob-test // use the Docker bridge network called 'mallob-test'
|
||||
# --entrypoint bash // when the container starts, it runs bash
|
||||
# --rm // remove the docker container when exiting
|
||||
# -v // mount $HOST_RUNDIR in the host filesystem at '/$DOCKER_RUNDIR' in docker image
|
||||
# -t // Docker image name for leader (script hard-codes 'leader' tag)
|
||||
# -c <bash_command> // gets passed to bash shell
|
||||
|
||||
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:$DOCKER_RUNDIR -t $DOCKER_IMAGE -c "/competition/init_solver.sh; exec bash"
|
@ -0,0 +1 @@
|
||||
{"return_code": 10, "artifacts": {"stdout_path": "/rundir/stdout.log", "stderr_path": "/rundir/stderr.log"}}
|
@ -0,0 +1 @@
|
||||
Warning: Permanently added 'worker,172.18.0.2' (ECDSA) to the list of known hosts.
|
@ -0,0 +1,69 @@
|
||||
c
|
||||
c Mallob -- a parallel and distributed platform for job scheduling, load balancing, and SAT solving
|
||||
c Designed by P. Sanders and D. Schreiber 2018-2022
|
||||
c Developed by D. Schreiber 2019-2022
|
||||
c
|
||||
c 0.000 0 Program options: -0o=1 -J=1 -T=0 -ajpc=0 -ans=0 -appmode=fork -ba=4 -bicg=4 -c=1 -cbbs=1500 -cbdf=1 -cfci=500 -cg=1 -ch=0 -chaf=5 -checksums=0 -chstms=10 -cmp=0 -colors=0 -dc=0 -ddd=0 -delaymonkey=0 -derandomize=1 -evu=0 -fapii=0 -g=0 -gclls=0 -h=0 -hbbfs=10 -hubfs=9999999 -huca=-1 -interface-fs=0 -interface-ipc=0 -isp=0 -jc=4 -jcl=0 -jcup=0 -jjp=0 -jwl=0 -l=1 -latencymonkey=0 -ljpc=32 -mbfsd=4 -mbt=1000000 -mcips=10 -md=0 -mid=0 -mjps=0 -mlbdps=2 -mlpt=50000000 -mmpi=0 -mono=/rundir/experiment/test.cnf -nce=20 -os=0 -p=0.01 -phasediv=1 -pls=0 -pph=0 -q=0 -qcll=8 -qlbdl=2 -ril=0 -rpa=1 -rs=1 -rto=0 -s=1 -satsolver=k -scll=20 -seed=0 -sjd=0 -slbdl=30 -sleep=1000 -t=4 -trace-dir=competition -v=3 -w=-1 -wam=10000 -warmup=0 -watchdog=1 -wr=0 -y=1 -yield=0
|
||||
c 0.000 0 Mallob rls pid=18 on host d6efa65e5077
|
||||
c 0.000 0 8 workers, 1 clients
|
||||
c 0.126 0 Created client communicator
|
||||
c 0.225 0 Created worker communicator
|
||||
c 0.225 0 My bounce alternatives: 5 7 6 2
|
||||
c 0.225 0 Set up API at src/interface/api/api_connector.hpp
|
||||
c 0.225 0 <Janitor> tid=79
|
||||
c 0.235 0 <Reader> Starting
|
||||
c 0.297 0 Machine color 0 with 4 total workers (my rank: 0)
|
||||
c 0.297 0 I Mapping job "admin.mono-job.json" to internal ID #1
|
||||
c 0.297 0 <Reader> [T] Reading job #1 rev. 0 {/rundir/experiment/test.cnf} ...
|
||||
c 0.297 0 <Reader> [T] Initialized job #1 {/rundir/experiment/test.cnf} in 0.000s: 7 lits w/ separators, 0 assumptions
|
||||
c 0.298 0 Introducing job #1 rev. 0 : r.#1:0 rev. 0 <- [0] born=0.298 hops=0 epoch=-1 => [1]
|
||||
c 0.301 0 Scheduling r.#1:0 rev. 0 <- [0] born=0.298 hops=1 epoch=-1 on [3] (latency: 0.00327s)
|
||||
c 0.312 0 ADOPT r.#1:2 rev. 0 <- [3] born=0.336 hops=0 epoch=2 mode=1 <= [3]
|
||||
c 0.312 0 COMMIT #1 -> #1:2
|
||||
c 0.313 0 #1:2 growing: r.#1:5 rev. 0 <- [0] born=0.313 hops=0 epoch=2 => [6]
|
||||
c 0.313 0 #1:2 growing: r.#1:6 rev. 0 <- [0] born=0.313 hops=0 epoch=2 => [4]
|
||||
c 0.314 0 UNCOMMIT #1:2
|
||||
c 0.314 0 LOAD 1 (+#1:2)
|
||||
c 0.314 0 EXECUTE #1:2 <= [3]
|
||||
c 0.326 0 <#1> Mallob SAT engine rls pid=94
|
||||
c 0.326 0 <#1> S8.0 Diversifying 8
|
||||
c 0.326 0 <#1> S8.0 found result SAT for rev. 0
|
||||
c 0.326 0 <#1> S9.0 Diversifying 9
|
||||
c 0.326 0 <#1> S9.0 found result SAT for rev. 0
|
||||
c 0.326 0 <#1> S10.0 Diversifying 10
|
||||
c 0.327 0 <#1> S11.0 Diversifying 11
|
||||
c 0.327 0 <#1> S11.0 found result SAT for rev. 0
|
||||
c 0.327 0 <#1> S10.0 found result SAT for rev. 0
|
||||
c 0.337 0 LOAD 0 (-#1:2)
|
||||
c 0.337 0 #1:2 desires fulfilled=0.0000 latency={num:0 min:0.00000 med:0.00000 avg:0.00000 max:0.00000}
|
||||
c 0.337 0 <#1> END S8 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
|
||||
c 0.337 0 <#1> END S8 clenhist prod total:0 0
|
||||
c 0.337 0 <#1> END S8 clenhist digd total:0 0
|
||||
c 0.337 0 <#1> END S9 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
|
||||
c 0.337 0 <#1> END S9 clenhist prod total:0 0
|
||||
c 0.337 0 <#1> END S9 clenhist digd total:0 0
|
||||
c 0.337 0 <#1> END S10 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
|
||||
c 0.337 0 <#1> END S10 clenhist prod total:0 0
|
||||
c 0.337 0 <#1> END S10 clenhist digd total:0 0
|
||||
c 0.337 0 <#1> END S11 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
|
||||
c 0.337 0 <#1> END S11 clenhist prod total:0 0
|
||||
c 0.337 0 <#1> END S11 clenhist digd total:0 0
|
||||
c 0.337 0 <#1> END pps:12 dcs:8 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
|
||||
c 0.337 0 <#1> END exp:0/0 drp:0(0.000000) pflt:0 sflt:0
|
||||
c 0.337 0 <#1> clenhist prod total:0 0
|
||||
c 0.337 0 <#1> clenhist flfl total:0 0
|
||||
c 0.337 0 <#1> clenhist admt total:0 0
|
||||
c 0.337 0 <#1> clenhist drpd total:0 0
|
||||
c 0.337 0 <#1> clenhist dltd total:0 0
|
||||
c 0.337 0 <#1> clenhist retd total:0 0
|
||||
c 0.348 0 RESPONSE_TIME #1 0.049750 rev. 0
|
||||
c 0.348 0 SOLUTION #1 SAT rev. 0
|
||||
s SATISFIABLE
|
||||
v 1 2 3 0
|
||||
c 0.349 0 Received exit signal <= [0]
|
||||
c 0.349 0 Terminating.
|
||||
c 0.444 0 busytime=0.023
|
||||
c 0.544 0 STATS treegrowth_latencies num:0 min:0.000000 max:0.000000 med:0.000000 mean:0.000000
|
||||
c 0.545 0 STATS balancing_latencies num:0 min:0.000000 max:0.000000 med:0.000000 mean:0.000000
|
||||
c 0.545 0 <Reader> Stopping
|
||||
c 0.794 0 Exiting happily
|
@ -0,0 +1,4 @@
|
||||
#!/bin/sh
|
||||
docker build -f satcomp-common/Dockerfile -t satcomp-infrastructure:common .
|
||||
docker build -f satcomp-leader/Dockerfile -t satcomp-infrastructure:leader .
|
||||
docker build -f satcomp-worker/Dockerfile -t satcomp-infrastructure:worker .
|
@ -0,0 +1,14 @@
|
||||
FROM ubuntu:20.04
|
||||
RUN useradd -ms /bin/bash ecs-user
|
||||
RUN apt-get update \
|
||||
&& DEBIAN_FRONTEND=noninteractive apt install -y python3-pip git openssh-server openssh-server iproute2 openmpi-bin openmpi-common iputils-ping \
|
||||
&& mkdir /var/run/sshd \
|
||||
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd \
|
||||
&& setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd \
|
||||
&& chown -R ecs-user /etc/ssh/ \
|
||||
&& su - ecs-user -c \
|
||||
'ssh-keygen -q -t rsa -f ~/.ssh/id_rsa -N "" \
|
||||
&& cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys \
|
||||
&& cp /etc/ssh/sshd_config ~/.ssh/sshd_config \
|
||||
&& sed -i "s/UsePrivilegeSeparation yes/UsePrivilegeSeparation no/g" ~/.ssh/sshd_config \
|
||||
&& printf "Host *\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config'
|
@ -0,0 +1,14 @@
|
||||
FROM satcomp-infrastructure:common
|
||||
RUN pip3 install flask waitress boto3 pytz polling2
|
||||
COPY --chown=ecs-user satcomp-solver-resources/ /opt/amazon/
|
||||
COPY --chown=ecs-user satcomp-leader/resources/solver /competition/solver
|
||||
COPY --chown=ecs-user satcomp-leader/resources/leader /competition/leader
|
||||
ENV PYTHONPATH=/opt/amazon:.:
|
||||
RUN chmod u+x /competition/solver
|
||||
RUN chmod u+x /competition/leader
|
||||
RUN ls /opt/amazon/arg_satcomp_solver_base
|
||||
RUN chmod u+x /opt/amazon/arg_satcomp_solver_base/leader_entrypoint.py
|
||||
RUN service ssh start
|
||||
USER ecs-user
|
||||
EXPOSE 22
|
||||
ENTRYPOINT /usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config & /opt/amazon/arg_satcomp_solver_base/leader_entrypoint.py
|
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env python3
|
||||
""" This is a test script that saves leader status to json file every second"""
|
||||
import json
|
||||
import time
|
||||
|
||||
|
||||
def update_leader_status():
|
||||
while True:
|
||||
data = {"status": "READY", "timestamp": int(time.time())}
|
||||
with open(
|
||||
"/competition/leader_node_status.json",
|
||||
"w",
|
||||
) as statusfile:
|
||||
statusfile.write(json.dumps(data))
|
||||
time.sleep(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
update_leader_status()
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
echo "This is a test! Here is what the log would look like"
|
||||
echo " \
|
||||
{
|
||||
\"return_code\": 0,
|
||||
\"artifacts\": {
|
||||
\"stdout_path\": \"$1/solver_stdout.log\",
|
||||
\"stderr_path\": \"$1/solver_stderr.log\"
|
||||
}
|
||||
}
|
||||
" > $1/solver_out.json
|
||||
|
||||
echo "This is the stdout log" >> $1/solver_stdout.log
|
||||
echo "This is the stderr log" >> $1/solver_stderr.log
|
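For illustration only, a minimal Python equivalent of this test stub (assuming, as above, that the request directory is passed as the first argument and that solver_out.json is the output the infrastructure expects):

#!/usr/bin/env python3
# Illustrative sketch: Python equivalent of the test stub above.
# It takes the request directory as its only argument and writes the
# solver_out.json structure shown above (return_code plus log paths).
# A real solver would also write the stdout/stderr logs it points to.
import json
import os
import sys

request_dir = sys.argv[1]
out = {
    "return_code": 0,
    "artifacts": {
        "stdout_path": os.path.join(request_dir, "solver_stdout.log"),
        "stderr_path": os.path.join(request_dir, "solver_stderr.log"),
    },
}
with open(os.path.join(request_dir, "solver_out.json"), "w") as f:
    f.write(json.dumps(out))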
@ -0,0 +1 @@
|
||||
# Implement your code here.
|
@ -0,0 +1,51 @@
|
||||
"""
|
||||
This module runs in the background of the Leader Base Container.
|
||||
It updates the manifest periodically.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import threading
|
||||
import uuid
|
||||
import time
|
||||
|
||||
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest, NodeType
|
||||
from arg_satcomp_solver_base.utils import FileOperations
|
||||
|
||||
|
||||
class LeaderTimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class LeaderStatusChecker(threading.Thread):
|
||||
"""Thread that runs in the background publishing leader READY status & IP address by
|
||||
updating manifest every second"""
|
||||
|
||||
leader_poll_sleep_time = 1
|
||||
file_operations: FileOperations = FileOperations()
|
||||
|
||||
def __init__(self, node_manifest: DynamodbManifest):
|
||||
threading.Thread.__init__(self)
|
||||
self.node_manifest = node_manifest
|
||||
self.logger = logging.getLogger("LeaderPoller")
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
self.setDaemon(True)
|
||||
|
||||
def update_manifest(self, uuid: str, local_ip_address: str, status: str):
|
||||
self.logger.info(f"Giving node heartbeat for node {uuid} with ip {local_ip_address}")
|
||||
self.node_manifest.register_node(str(uuid), local_ip_address, status, NodeType.LEADER.name)
|
||||
|
||||
# TODO: This should be updated so that the leader emits its status (just like the workers)
|
||||
# Perhaps for SAT-Comp 2023
|
||||
def run(self):
|
||||
local_ip_address = socket.gethostbyname(socket.gethostname())
|
||||
node_uuid = uuid.uuid4()
|
||||
self.logger.info(f"Registering leader node {node_uuid} with ip {local_ip_address}")
|
||||
while True:
|
||||
current_timestamp = int(time.time())
|
||||
|
||||
self.update_manifest(node_uuid, local_ip_address, "READY")
|
||||
|
||||
self.logger.info("Leader updated status")
|
||||
time.sleep(self.leader_poll_sleep_time)
|
@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""This file is the main entrypoint for the Leader Node Base Container"""
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
from time import sleep
|
||||
|
||||
|
||||
sys.path.append('/opt/amazon/lib/python3.8/site-packages/')
|
||||
|
||||
from arg_satcomp_solver_base.solver.command_line_solver import CommandLineSolver
|
||||
from arg_satcomp_solver_base.sqs_queue.sqs_queue import SqsQueue
|
||||
from arg_satcomp_solver_base.poller.poller import Poller
|
||||
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest
|
||||
from arg_satcomp_solver_base.task_end_notification.task_end_notifier import TaskEndNotifier
|
||||
from arg_satcomp_solver_base.leader.leader import LeaderStatusChecker
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
QUEUE_NAME = os.getenv('SQS_QUEUE_NAME')
|
||||
OUTPUT_QUEUE_NAME = os.getenv('SQS_OUTPUT_QUEUE_NAME')
|
||||
SATCOMP_BUCKET_NAME = os.getenv('SATCOMP_BUCKET_NAME')
|
||||
|
||||
def run_leader(path):
|
||||
cmd = os.path.join(path, "leader")
|
||||
stdout = os.path.join(path, "leader_stdout.log")
|
||||
stderr = os.path.join(path, "leader_stderr.log")
|
||||
with open(stdout, "wb") as out, open(stderr, "wb") as err:
|
||||
subprocess.Popen(cmd, stdout=out, stderr=err)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger = logging.getLogger("Leader Entrypoint")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
dir = "/competition"
|
||||
logger.info("Running leader healthcheck")
|
||||
run_leader(dir) # Run script to save status to leader_node_status.json
|
||||
sleep(1) # Wait for creation of leader_node_status.json file
|
||||
|
||||
logger.info("Getting input queue: %s", QUEUE_NAME)
|
||||
sqs_input_queue = SqsQueue.get_sqs_queue(QUEUE_NAME)
|
||||
|
||||
logger.info("Getting output queue: %s", QUEUE_NAME)
|
||||
sqs_output_queue = SqsQueue.get_sqs_queue(OUTPUT_QUEUE_NAME)
|
||||
|
||||
logger.info(f"Bucket name: {SATCOMP_BUCKET_NAME}")
|
||||
|
||||
logger.info("Getting task end notifier")
|
||||
task_end_notifier = TaskEndNotifier.get_task_end_notifier()
|
||||
|
||||
logger.info("Getting node manifest")
|
||||
node_manifest = DynamodbManifest.get_dynamodb_manifest()
|
||||
|
||||
logger.info("Building command line solver to run solver script /competition/solver")
|
||||
|
||||
solver = CommandLineSolver(f"{dir}/solver")
|
||||
|
||||
logger.info("Getting local IP address")
|
||||
local_ip_address = socket.gethostbyname(socket.gethostname())
|
||||
|
||||
logger.info("Starting leader status checker")
|
||||
leader_status = LeaderStatusChecker(node_manifest)
|
||||
leader_status.start()
|
||||
|
||||
logger.info("starting poller")
|
||||
poller = Poller(1, local_ip_address, sqs_input_queue, sqs_output_queue, node_manifest, task_end_notifier, solver, SATCOMP_BUCKET_NAME)
|
||||
poller.start()
|
||||
poller.join()
|
@ -0,0 +1,97 @@
|
||||
import logging
|
||||
import time
|
||||
from enum import Enum
|
||||
|
||||
import boto3
|
||||
|
||||
|
||||
class NodeStatus(Enum):
|
||||
READY = "READY"
|
||||
BUSY = "BUSY"
|
||||
FAILED = "FAILED"
|
||||
|
||||
|
||||
class NodeType(Enum):
|
||||
WORKER = "WORKER"
|
||||
LEADER = "LEADER"
|
||||
|
||||
|
||||
class DynamodbManifest:
|
||||
"""
|
||||
Node manifest for storing IP and status of satcomp nodes based on a DynamoDB Table
|
||||
"""
|
||||
def __init__(self, table, node_expiration_time=120):
|
||||
self.table = table
|
||||
self.node_expiration_time = node_expiration_time
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def register_node(self, node_id: str, ip_address: str, status: str, node_type: str):
|
||||
"""
|
||||
Register node with node manifest table
|
||||
:param node_id: Unique ID for this node
|
||||
:param ip_address: IP address for this node
|
||||
:param status: Current status of Node
|
||||
:param node_type: Node type, either WORKER or LEADER
|
||||
:return:
|
||||
"""
|
||||
current_time = int(time.time())
|
||||
self.table.update_item(
|
||||
Key={
|
||||
'nodeId': node_id
|
||||
},
|
||||
UpdateExpression="set nodeIp = :ipVal, #stskey = :stsVal, nodeType = :nodeTypeVal,"
|
||||
" lastModified = :lastModifiedVal",
|
||||
ExpressionAttributeNames={
|
||||
"#stskey": "status"
|
||||
},
|
||||
ExpressionAttributeValues={
|
||||
":ipVal": ip_address,
|
||||
":stsVal": status,
|
||||
":nodeTypeVal": node_type,
|
||||
":lastModifiedVal": current_time
|
||||
}
|
||||
)
|
||||
|
||||
def get_all_ready_nodes(self, nodeTypeVal):
|
||||
"""
|
||||
Returns all nodes of type nodeTypeVal that are currently set to READY status
|
||||
:return:
|
||||
"""
|
||||
expiration_time = int(time.time() - self.node_expiration_time)
|
||||
nodes = self.table.scan(
|
||||
Select="ALL_ATTRIBUTES",
|
||||
FilterExpression="#nodeStatusKey = :nodeStatusVal and #nodeTypeKey = :nodeTypeVal and "
|
||||
"#lastModifiedKey > :lastModifiedVal",
|
||||
ExpressionAttributeNames={
|
||||
"#nodeStatusKey": "status",
|
||||
"#nodeTypeKey": "nodeType",
|
||||
"#lastModifiedKey": "lastModified"
|
||||
},
|
||||
ExpressionAttributeValues={
|
||||
":nodeStatusVal": "READY",
|
||||
":nodeTypeVal": nodeTypeVal,
|
||||
":lastModifiedVal": expiration_time
|
||||
},
|
||||
ConsistentRead=True
|
||||
)
|
||||
return nodes["Items"]
|
||||
|
||||
def get_all_ready_worker_nodes(self):
|
||||
"""
|
||||
Returns all worker nodes that are currently set to READY status
|
||||
:return:
|
||||
"""
|
||||
return self.get_all_ready_nodes("WORKER")
|
||||
|
||||
def get_all_ready_leader_nodes(self):
|
||||
"""
|
||||
Returns a list of zero or one leader node currently set to READY status
|
||||
:return:
|
||||
"""
|
||||
return self.get_all_ready_nodes("LEADER")
|
||||
|
||||
@staticmethod
|
||||
def get_dynamodb_manifest():
|
||||
dynamodb_resource = boto3.resource("dynamodb")
|
||||
table = dynamodb_resource.Table("SatCompNodeManifest")
|
||||
return DynamodbManifest(table)
|
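A minimal usage sketch (illustrative only; assumes AWS credentials are configured and the SatCompNodeManifest table exists; the IP address is a placeholder):

# Illustrative sketch: publish a worker heartbeat and list READY workers.
import uuid

from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest, NodeType

manifest = DynamodbManifest.get_dynamodb_manifest()
node_id = str(uuid.uuid4())
manifest.register_node(node_id, "10.0.0.12", "READY", NodeType.WORKER.name)  # placeholder IP
print(manifest.get_all_ready_worker_nodes())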
@ -0,0 +1,214 @@
|
||||
"""
|
||||
This module is a Poller which will run in the background of the satcomp Base Container.
|
||||
It will poll an SQS queue looking for problems to solve and send them to solver implementations.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
from enum import Enum
|
||||
from json import JSONDecodeError
|
||||
from time import sleep
|
||||
import os
|
||||
|
||||
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest
|
||||
from arg_satcomp_solver_base.s3_file_system.s3_file_system import S3FileSystem, S3FileSystemException
|
||||
from arg_satcomp_solver_base.solver.command_line_solver import Solver, SolverException
|
||||
from arg_satcomp_solver_base.sqs_queue.sqs_queue import SqsQueue, SqsQueueException
|
||||
from arg_satcomp_solver_base.task_end_notification.task_end_notifier import TaskEndNotifier
|
||||
from arg_satcomp_solver_base.utils import FileOperations
|
||||
|
||||
MOUNT_POINT = '/tmp'
|
||||
|
||||
|
||||
class PollerStatus(Enum):
|
||||
HEALTHY = "HEALTHY"
|
||||
|
||||
|
||||
class PollerTimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Poller(threading.Thread):
|
||||
"""Thread that runs in the background trying to pull work off of an SQS
|
||||
queue and submitting that work to a solver"""
|
||||
worker_poll_timeout = 120
|
||||
worker_poll_sleep_time = 5
|
||||
queue: SqsQueue
|
||||
output_queue: SqsQueue
|
||||
node_manifest: DynamodbManifest
|
||||
solver: Solver
|
||||
thread_id: int
|
||||
ip_address: str
|
||||
file_operations: FileOperations = FileOperations()
|
||||
health_status: PollerStatus
|
||||
s3_file_system: S3FileSystem = S3FileSystem.get_s3_file_system()
|
||||
s3_bucket: str
|
||||
task_end_notifier: TaskEndNotifier
|
||||
|
||||
def __init__(self, thread_id: int,
|
||||
ip_address: str,
|
||||
queue: SqsQueue,
|
||||
output_queue: SqsQueue,
|
||||
node_manifest: DynamodbManifest,
|
||||
task_end_notifier: TaskEndNotifier,
|
||||
solver: Solver,
|
||||
s3_bucket: str):
|
||||
threading.Thread.__init__(self)
|
||||
self.queue = queue
|
||||
self.output_queue = output_queue
|
||||
self.node_manifest = node_manifest
|
||||
self.task_end_notifier = task_end_notifier
|
||||
self.solver = solver
|
||||
self.thread_id = thread_id
|
||||
self.ip_address = ip_address
|
||||
self.logger = logging.getLogger("Poller")
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
self.health_status = PollerStatus.HEALTHY
|
||||
self.s3_bucket = s3_bucket
|
||||
|
||||
def _is_valid_solver_request(self, solver_request):
|
||||
# TODO improve message validation
|
||||
# Expected message structure:
|
||||
"""{
|
||||
"formula" : {
|
||||
"value" : <s3 url>,
|
||||
"language": "SMTLIB2" | "DIMACS",
|
||||
},
|
||||
"solverConfig" : {
|
||||
"solverName" : "",
|
||||
"solverOptions" : [],
|
||||
"taskTimeoutSeconds" : 5
|
||||
},
|
||||
"num_workers": 0
|
||||
}"""
|
||||
""" {
|
||||
"formula": {
|
||||
"value": "s3://834251193136-us-east-1-my-project/test.cnf",
|
||||
"language": ""
|
||||
},
|
||||
"solverConfig": {
|
||||
"solverName": "",
|
||||
"solverOptions": [],
|
||||
"taskTimeoutSeconds": 10
|
||||
},
|
||||
"num_workers": 0
|
||||
}
|
||||
"""
|
||||
if solver_request is None:
|
||||
return False
|
||||
if "formula" not in solver_request:
|
||||
return False
|
||||
if "language" not in solver_request["formula"]:
|
||||
return False
|
||||
if "value" not in solver_request["formula"]:
|
||||
return False
|
||||
if "solverConfig" not in solver_request:
|
||||
return False
|
||||
if "solverName" not in solver_request["solverConfig"]:
|
||||
return False
|
||||
if "solverOptions" not in solver_request["solverConfig"]:
|
||||
return False
|
||||
if "taskTimeoutSeconds" not in solver_request["solverConfig"]:
|
||||
return False
|
||||
if "num_workers" not in solver_request:
|
||||
return False
|
||||
return True
|
||||
|
||||
def run(self):
|
||||
# TODO: In the future we should make this more testable by separating the while loop
|
||||
# from the thread code itself
|
||||
# TODO: How do we want to handle client errors/internal server errors
|
||||
while True:
|
||||
# TODO: For now we are only handling problems that will fit in SQS message.
|
||||
# In the future this will have to change
|
||||
leader_working_directory = None
|
||||
efs_shared_directory = None
|
||||
try:
|
||||
self.logger.info("Trying to get messages from queue: %s", self.queue.queue_name)
|
||||
message = self.queue.get_message()
|
||||
if message is None:
|
||||
continue
|
||||
|
||||
msg_json = json.loads(message.read())
|
||||
message_handle = message.msg.receipt_handle
|
||||
self.logger.info("Got problem to solve from message with receipt handle: %s", message_handle)
|
||||
|
||||
message.delete()
|
||||
self.logger.info("Deleted message from queue")
|
||||
|
||||
if not self._is_valid_solver_request(msg_json):
|
||||
self.logger.error(f"Message {message.read()} is invalid. Skipping processing.")
|
||||
continue
|
||||
|
||||
s3_uri = msg_json.get("formula").get("value")
|
||||
timeout = msg_json.get("solverConfig").get("taskTimeoutSeconds")
|
||||
formula_language = msg_json.get("formula").get("language")
|
||||
solver_options = msg_json.get("solverConfig").get("solverOptions")
|
||||
num_workers = msg_json.get("num_workers", 0)
|
||||
|
||||
self.logger.info("Waiting for worker nodes to come up")
|
||||
self.logger.info(f"Task requests {num_workers} worker nodes")
|
||||
workers = self.wait_for_worker_nodes(num_workers)
|
||||
workers.append({"nodeIp": self.ip_address})
|
||||
|
||||
task_uuid = self.file_operations.generate_uuid()
|
||||
efs_uuid_directory = self.file_operations.create_custom_directory(MOUNT_POINT, task_uuid)
|
||||
self.logger.info("Created uuid directory in local container %s", efs_uuid_directory)
|
||||
download_location = self.s3_file_system.download_file(s3_uri, efs_uuid_directory)
|
||||
self.logger.info("Download problem to location: %s", download_location)
|
||||
|
||||
solver_response = self.solver.solve(download_location, efs_uuid_directory, workers, task_uuid, timeout, formula_language, solver_options)
|
||||
solver_response["driver"]["s3_uri"] = s3_uri
|
||||
self.logger.info("Solver response:")
|
||||
self.logger.info(solver_response)
|
||||
|
||||
self.logger.info("Writing response to output queue")
|
||||
self.output_queue.put_message(json.dumps(solver_response))
|
||||
|
||||
self.logger.info(f"Writing all files in request directory path: {efs_uuid_directory} to S3 bucket: {self.s3_bucket}")
|
||||
s3_uri = "s3://" + self.s3_bucket + efs_uuid_directory
|
||||
self.logger.info(f"S3 URI is: {s3_uri} and S3 bucket is: {self.s3_bucket}")
|
||||
self.s3_file_system.upload_directory_tree(efs_uuid_directory, s3_uri)
|
||||
|
||||
self.logger.info(
|
||||
"Cleaning up solver output directory %s",
|
||||
solver_response["solver"].get("request_directory_path")
|
||||
)
|
||||
self.file_operations.remove_directory(solver_response["solver"].get("request_directory_path"))
|
||||
|
||||
|
||||
|
||||
self.logger.info("Cleaning up uuid directory: %s", efs_uuid_directory)
|
||||
self.file_operations.remove_directory(efs_uuid_directory)
|
||||
|
||||
self.logger.info("Sending notification that solving is complete")
|
||||
self.task_end_notifier.notify_task_end(self.ip_address)
|
||||
|
||||
except SolverException as e:
|
||||
self.logger.error("Failed to run solver on message with receipt handle %s", message.msg.receipt_handle)
|
||||
self.logger.exception(e)
|
||||
except SqsQueueException as e:
|
||||
self.logger.error("Failed to read from SQS queue: %s", self.queue.queue_name)
|
||||
self.logger.exception(e)
|
||||
except JSONDecodeError as e:
|
||||
self.logger.error("Failed to run solver on message with receipt handle %s", message.msg.receipt_handle)
|
||||
self.logger.error("Message is not valid Json: %s", message.read())
|
||||
self.logger.exception(e)
|
||||
except S3FileSystemException as e:
|
||||
self.logger.error("Failed to download file from s3")
|
||||
self.logger.exception(e)
|
||||
|
||||
|
||||
def wait_for_worker_nodes(self, num_workers):
|
||||
worker_nodes = []
|
||||
wait_time = 0
|
||||
self.logger.info(f"Waiting for {num_workers} worker nodes")
|
||||
while len(worker_nodes) < num_workers:
|
||||
worker_nodes = self.node_manifest.get_all_ready_worker_nodes()
|
||||
sleep(self.worker_poll_sleep_time)
|
||||
wait_time += self.worker_poll_sleep_time
|
||||
if wait_time > self.worker_poll_timeout:
|
||||
raise PollerTimeoutException(f"Timed out waiting for {num_workers} to report. "
|
||||
f"Only {len(worker_nodes)} reported")
|
||||
self.logger.info(f"Workers reported: {worker_nodes}")
|
||||
return worker_nodes
|
@ -0,0 +1 @@
|
||||
# Marker file that indicates this package supports typing
|
@ -0,0 +1,92 @@
|
||||
"""Implementation of classes for interacting with S3"""
|
||||
import logging
|
||||
import ntpath
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
|
||||
class S3FileSystemException(Exception):
|
||||
"""Exception for S3FileSystem errors"""
|
||||
|
||||
|
||||
class S3FileSystem:
|
||||
"""Class to represent an S3 storage"""
|
||||
|
||||
def __init__(self, s3_client):
|
||||
self.s3_client = s3_client
|
||||
self.logger = logging.getLogger("S3FileSystem")
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
|
||||
def download_file(self, problem_uri: str, download_dest_folder: str):
|
||||
"""
|
||||
Function to download file based on user provided url
|
||||
@param download_dest_folder: folder to download the problem to
|
||||
@type problem_uri: s3 uri that is in the form of s3://bucket_name/path_to_file
|
||||
"""
|
||||
if problem_uri is None:
|
||||
raise S3FileSystemException('s3 uri is empty')
|
||||
|
||||
problem_uri = urlparse(problem_uri, allow_fragments=False)
|
||||
bucket_name = problem_uri.netloc
|
||||
file_path = problem_uri.path.lstrip('/')
|
||||
|
||||
# Get the file name
|
||||
file_name = ntpath.basename(file_path)
|
||||
download_dest = os.path.join(download_dest_folder, file_name)
|
||||
|
||||
try:
|
||||
self.logger.debug('Downloading file %s from bucket %s to destination %s', file_path, bucket_name, download_dest)
|
||||
self.s3_client.download_file(bucket_name, file_path, download_dest)
|
||||
except ClientError as e:
|
||||
self.logger.error("Failed to download file from s3")
|
||||
self.logger.exception(e)
|
||||
raise S3FileSystemException(f"Failed to download file from s3 with download destination {download_dest}")
|
||||
|
||||
return download_dest
|
||||
|
||||
def upload_file(self, local_file_path: str, bucket_name: str, object_name: str):
|
||||
try:
|
||||
if object_name.startswith('/'):
|
||||
object_name = object_name[1:]
|
||||
self.logger.debug('Uploading file %s to bucket %s and file path %s', local_file_path, bucket_name, object_name)
|
||||
self.s3_client.upload_file(local_file_path, bucket_name, object_name)
|
||||
except ClientError as e:
|
||||
self.logger.error("Failed to download file from s3")
|
||||
self.logger.exception(e)
|
||||
raise S3FileSystemException(f"Failed to upload file to s3 with upload destination {bucket_name}/{object_name}")
|
||||
|
||||
|
||||
def upload_file_uri(self, local_file_path: str, s3_uri: str):
|
||||
"""
|
||||
Function to upload file to s3 from local filesystem based on user provided path
|
||||
@param local_file_path: path to the problem to upload.
|
||||
@type s3_uri: s3 uri that is in the form of s3://bucket_name/path_to_file for uploading.
|
||||
"""
|
||||
problem_uri = urlparse(s3_uri, allow_fragments=False)
|
||||
bucket_name = problem_uri.netloc
|
||||
object_name = problem_uri.path.lstrip('/')
|
||||
# self.logger.info(f"bucket_name: {bucket_name}, object_name: {object_name}, ")
|
||||
self.upload_file(local_file_path, bucket_name, object_name)
|
||||
|
||||
|
||||
def upload_directory_tree(self, local_dir_path: str, s3_uri: str):
|
||||
self.logger.debug('Uploading directory tree %s to %s', local_dir_path, s3_uri)
|
||||
problem_uri = urlparse(s3_uri, allow_fragments=False)
|
||||
bucket_name = problem_uri.netloc
|
||||
object_name_base = problem_uri.path
|
||||
|
||||
# iterate through all files and upload them to s3
|
||||
for root, dirs, files in os.walk(local_dir_path):
|
||||
for file in files:
|
||||
file_name = os.path.join(root, file)
|
||||
object_name = os.path.join(object_name_base, file_name)
|
||||
self.upload_file(file_name, bucket_name, object_name)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_s3_file_system():
|
||||
import boto3
|
||||
s3 = boto3.client('s3')
|
||||
return S3FileSystem(s3)
|
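A minimal usage sketch (illustrative only; the bucket and key are placeholders and boto3 credentials are assumed to be configured):

# Illustrative sketch: download a formula into /tmp and upload a result file.
from arg_satcomp_solver_base.s3_file_system.s3_file_system import S3FileSystem

fs = S3FileSystem.get_s3_file_system()
local_path = fs.download_file("s3://my-bucket/test.cnf", "/tmp")    # placeholder URI
fs.upload_file_uri(local_path, "s3://my-bucket/results/test.cnf")   # placeholder destination
print(local_path)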
@ -0,0 +1,181 @@
|
||||
import logging
|
||||
from threading import TIMEOUT_MAX
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
import os
|
||||
import argparse
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
|
||||
sys.path.append('../')
|
||||
from arg_satcomp_solver_base.sqs_queue.sqs_queue import SqsQueue
|
||||
|
||||
|
||||
class SolverTimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class S3ProblemStore:
|
||||
"""Class to represent S3 storage location"""
|
||||
|
||||
def __init__(self, s3_resource, formula_bucket, logger):
|
||||
self.s3_resource = s3_resource
|
||||
self.logger = logger
|
||||
self.formula_bucket = formula_bucket
|
||||
|
||||
def list_cnf_file_s3_path_pairs(self):
|
||||
try:
|
||||
self.logger.debug(f'Attempting to list files for bucket {self.formula_bucket}')
|
||||
bkt = self.s3_resource.Bucket(self.formula_bucket)
|
||||
pairs = [(bkt_object.key, f's3://{self.formula_bucket}/{bkt_object.key}') for bkt_object in bkt.objects.all()]
|
||||
return pairs
|
||||
except ClientError as e:
|
||||
self.logger.error(f"Failed to list s3 bucket objects from {self.formula_bucket}")
|
||||
self.logger.exception(e)
|
||||
raise
|
||||
|
||||
def get_bucket(self):
|
||||
return self.formula_bucket
|
||||
|
||||
@staticmethod
|
||||
def get_s3_file_system(session, formula_bucket, logger):
|
||||
s3 = session.resource('s3')
|
||||
return S3ProblemStore(s3, formula_bucket, logger)
|
||||
|
||||
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
|
||||
# maximum retries on non-success (other than timeout)
|
||||
RETRIES_MAX = 2
|
||||
TIMEOUT = 1030
|
||||
|
||||
|
||||
class ProblemRunner:
|
||||
"""Class to run a problem set through a distributed solver"""
|
||||
def __init__(self, problem_queue, result_queue, s3_problem_store, args, logger):
|
||||
self.num_workers = args.num_workers
|
||||
self.clean_first = args.clean_first
|
||||
self.json_file = args.json_file
|
||||
self.s3_problem_store = s3_problem_store
|
||||
self.problem_queue = problem_queue
|
||||
self.result_queue = result_queue
|
||||
self.logger = logger
|
||||
|
||||
|
||||
def run_one_problem(self, s3_uri):
|
||||
msg = {"s3_uri": s3_uri, "num_workers": self.num_workers}
|
||||
msg_str = json.dumps(msg, indent = 4)
|
||||
|
||||
done = False
|
||||
retries = 0
|
||||
while not done:
|
||||
self.problem_queue.put_message(msg_str)
|
||||
start_time = time.perf_counter()
|
||||
result = self.result_queue.get_message()
|
||||
|
||||
while result is None:
|
||||
self.logger.info(f"Awaiting completion for file: {s3_uri}")
|
||||
end_time = time.perf_counter()
|
||||
if end_time - start_time > TIMEOUT:
|
||||
raise SolverTimeoutException(f"Client exceeded max time waiting for response ({str(TIMEOUT)}). Did leader crash?")
|
||||
result = self.result_queue.get_message()
|
||||
|
||||
result_json = json.loads(result.read())
|
||||
result.delete()
|
||||
print(f"Problem {s3_uri} completed! result is: {json.dumps(result_json, indent=4)}")
|
||||
|
||||
if result_json["driver"]["timed_out"]:
|
||||
done = True
|
||||
else:
|
||||
result = (result_json["solver"]["output"]["result"]).lower()
|
||||
retries = retries + 1
|
||||
|
||||
if result == "unsatisfiable" or result == "satisfiable" or retries >= RETRIES_MAX:
|
||||
done = True
|
||||
|
||||
return result_json
|
||||
|
||||
def run_problems(self):
|
||||
results = {}
|
||||
if os.path.exists(self.json_file) and not self.clean_first:
|
||||
with open(self.json_file, "r") as result_file:
|
||||
results = json.load(result_file)
|
||||
|
||||
for (input_file, s3_uri) in self.s3_problem_store.list_cnf_file_s3_path_pairs():
|
||||
self.logger.info(f'attempting to solve file: {s3_uri}')
|
||||
|
||||
# skip previously solved files (unless 'clean' flag is true)
|
||||
if input_file in results:
|
||||
print(f"Problem {s3_uri} is cached from earlier run. Result is: {json.dumps(results[input_file], indent=4)}")
|
||||
continue
|
||||
|
||||
result_json = self.run_one_problem(s3_uri)
|
||||
|
||||
# write answer (overwrite existing file)
|
||||
results[input_file] = result_json
|
||||
|
||||
with open(self.json_file, "w") as result_file:
|
||||
json.dump(results, result_file, indent=4)
|
||||
|
||||
def init_argparse() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run a SATComp solver through all files in a bucket"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--version", action="version",
|
||||
version = f"{parser.prog} version 0.1.0"
|
||||
)
|
||||
parser.add_argument('--profile', required = True, help = "AWS profile")
|
||||
parser.add_argument('--problem-queue', required=True, type=str, help='Name of the problem SQS queue (sends jobs to solver) ')
|
||||
parser.add_argument('--result-queue', required=True, type=str, help='Name of the result SQS queue (receives outputs from solver) ')
|
||||
parser.add_argument('--s3-bucket', required=True, type=str, help='Name of the s3 bucket')
|
||||
parser.add_argument('--num-workers', required=True, type=int, help='Number of workers in the cluster')
|
||||
parser.add_argument('--verbose', type=int, help='Set the verbosity level of output: 0 = ERROR, 1 = INFO, 2 = DEBUG (default: 0)')
|
||||
parser.add_argument('--clean-first', action='store_true', help='Clean the output file prior to run (default: False)')
|
||||
parser.add_argument('--purge-queues', action='store_true', help='Purge queues and wait one minute prior to start?')
|
||||
parser.add_argument('json_file', help='Path to json file containing results')
|
||||
return parser
|
||||
|
||||
def main():
|
||||
logger = logging.getLogger("satcomp_log")
|
||||
try:
|
||||
parser = init_argparse()
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.verbose and args.verbose > 0:
|
||||
if args.verbose == 1:
|
||||
logger.setLevel(logging.INFO)
|
||||
elif args.verbose >= 2:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
else:
|
||||
logger.setLevel(logging.ERROR)
|
||||
|
||||
logger.debug('command line arguments: ' + str(args))
|
||||
|
||||
session = boto3.Session(profile_name=args.profile)
|
||||
s3 = session.resource('s3')
|
||||
|
||||
# create AWS access objects
|
||||
problem_queue = SqsQueue.get_sqs_queue_from_session(session, args.problem_queue)
|
||||
result_queue = SqsQueue.get_sqs_queue_from_session(session, args.result_queue)
|
||||
s3_problem_store = S3ProblemStore.get_s3_file_system(session, args.s3_bucket, logger)
|
||||
problem_runner = ProblemRunner(problem_queue, result_queue, s3_problem_store, args, logger)
|
||||
|
||||
if args.purge_queues:
|
||||
logger.info('Purging problem and result queues')
|
||||
problem_queue.purge()
|
||||
result_queue.purge()
|
||||
# recommendation of boto api
|
||||
time.sleep(60)
|
||||
|
||||
problem_runner.run_problems()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failure during 'run_satcomp_solver'. Error: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,120 @@
|
||||
"""
|
||||
This module contains the Solver implementation that shells out to a solver
|
||||
executable and saves the logs
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
from json import JSONDecodeError
|
||||
|
||||
from arg_satcomp_solver_base.solver.solver import Solver
|
||||
from arg_satcomp_solver_base.solver.run_command import CommandRunner
|
||||
|
||||
from arg_satcomp_solver_base.utils import FileOperations
|
||||
|
||||
FAILED_TO_WRITE_PROBLEM_TEXT = "Failed to write problem text to file"
|
||||
|
||||
|
||||
class SolverException(Exception):
|
||||
"""Solver run failure exception"""
|
||||
|
||||
|
||||
class CommandLineSolver(Solver):
|
||||
"""Solver implementation that shells out to an executable wrapper for a solver executable and saves the logs"""
|
||||
solver_command: str
|
||||
command_runner: CommandRunner
|
||||
stdout_target_loc: str = "base_container_stdout.log"
|
||||
stderr_target_loc: str = "base_container_stderr.log"
|
||||
|
||||
DEFAULT_TIMEOUT_SECONDS: int = 3600
|
||||
|
||||
def __init__(self, solver_command: str):
|
||||
self.solver_command = solver_command
|
||||
self.command_runner = CommandRunner(self.stdout_target_loc, self.stderr_target_loc)
|
||||
self.file_operations = FileOperations()
|
||||
self.logger = logging.getLogger("CommandLineSolver")
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
|
||||
def _save_input_json(self, formula_file: str, directory_path: str, workers: list, formula_language: str, solver_argument_list: list, timeout_seconds: str):
|
||||
input_json = {
|
||||
"formula_file": formula_file,
|
||||
"worker_node_ips": list(map(lambda w: w["nodeIp"], workers)),
|
||||
"formula_language": formula_language,
|
||||
"solver_argument_list": solver_argument_list,
|
||||
"timeout_seconds": timeout_seconds,
|
||||
}
|
||||
try:
|
||||
self.file_operations.write_json_file(os.path.join(directory_path, "input.json"), input_json)
|
||||
except OSError as e:
|
||||
self.logger.exception(e)
|
||||
raise SolverException(FAILED_TO_WRITE_PROBLEM_TEXT)
|
||||
|
||||
def _run_command(self, cmd: str, arguments: list, output_directory: str, timeout: int):
|
||||
"""Run a command as a subprocess and save logs"""
|
||||
cmd_list = [cmd]
|
||||
if arguments is not None:
|
||||
cmd_list.extend(arguments)
|
||||
process_result = self.command_runner.run(cmd_list, output_directory, timeout)
|
||||
return process_result
|
||||
|
||||
def _get_solver_result(self, request_directory_path):
|
||||
try:
|
||||
raw_solver_result = self.file_operations.read_json_file(os.path.join(request_directory_path, "solver_out.json"))
|
||||
except FileNotFoundError:
|
||||
self.logger.error("Solver did not generate output JSON")
|
||||
except JSONDecodeError as e:
|
||||
self.logger.error("Solver output not valid json")
|
||||
self.logger.exception(e)
|
||||
else:
|
||||
return raw_solver_result
|
||||
# If we cannot read the solver result, that is not an error state
|
||||
# since it is user provided and may not work
|
||||
return None
|
||||
|
||||
def solve(self, formula_file: str, request_directory_path: str, workers: list, task_uuid: str, timeout: int, formula_language: str, solver_argument_list: list):
|
||||
"""Solve implementation that shells out to a subprocess"""
|
||||
|
||||
self._save_input_json(formula_file, request_directory_path, workers, formula_language, solver_argument_list, timeout)
|
||||
|
||||
if not timeout:
|
||||
timeout = CommandLineSolver.DEFAULT_TIMEOUT_SECONDS
|
||||
try:
|
||||
process_result = self._run_command(self.solver_command, [request_directory_path],
|
||||
request_directory_path, timeout)
|
||||
except FileNotFoundError:
|
||||
self.logger.error(f"Failed to execute solver script at path: {self.solver_command}. "
|
||||
f"Solver executable does not exist")
|
||||
raise SolverException(f"Failed to execute solver script at path: {self.solver_command}. "
|
||||
f"Solver executable does not exist")
|
||||
|
||||
solver_result = self._get_solver_result(request_directory_path)
|
||||
self.logger.info(solver_result)
|
||||
if solver_result is None:
|
||||
task_state = {
|
||||
"status": "FAILED",
|
||||
"message": "Error retrieving solver result"
|
||||
}
|
||||
else:
|
||||
task_state = process_result.get("task_state")
|
||||
|
||||
solver_runtime_millis = None
# CommandRunner reports elapsed_time in seconds; convert it to milliseconds
elapsed_time = process_result.get("elapsed_time")
if elapsed_time:
solver_runtime_millis = round(elapsed_time * 1000)
|
||||
"""
|
||||
Our output includes the stdout and stderr logs of the driver (the /satcomp/solver executable),
|
||||
as well as whatever JSON output is provided once that solver finishes.
|
||||
"""
|
||||
return {
|
||||
"task_id": task_uuid,
|
||||
"driver": {
|
||||
"stdout": os.path.join(request_directory_path, process_result.get("stdout")),
|
||||
"stderr": os.path.join(request_directory_path, process_result.get("stderr")),
|
||||
"return_code": process_result.get("return_code"),
|
||||
"solver_runtime_millis": solver_runtime_millis
|
||||
},
|
||||
"solver": {
|
||||
"output": solver_result,
|
||||
"request_directory_path": request_directory_path
|
||||
},
|
||||
"task_state": task_state
|
||||
}
|
@ -0,0 +1,70 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
import threading
|
||||
import signal
|
||||
from dataclasses import dataclass
|
||||
|
||||
from arg_satcomp_solver_base.utils import FileOperations
|
||||
|
||||
|
||||
@dataclass
|
||||
class CommandRunner:
|
||||
TIMEOUT_RETURNCODE = -100
|
||||
stdout_target_loc: str
|
||||
stderr_target_loc: str
|
||||
file_operations: FileOperations = FileOperations()
|
||||
|
||||
def __init__(self, stdout_target_loc, stderr_target_loc):
|
||||
self.logger = logging.getLogger("RunCommand")
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
self.stdout_target_loc = stdout_target_loc
|
||||
self.stderr_target_loc = stderr_target_loc
|
||||
os.environ['PYTHONUNBUFFERED'] = "1"
|
||||
|
||||
def process_stream(self, stream, str_name, file_handle):
|
||||
line = stream.readline()
|
||||
while line != "":
|
||||
self.logger.info(f"{str_name}: {line}")
|
||||
file_handle.write(line)
|
||||
line = stream.readline()
|
||||
|
||||
def run(self, cmd: list, output_directory: str, time_out: int):
|
||||
self.logger.info("Running command: %s", str(cmd))
|
||||
with open(os.path.join(output_directory, self.stdout_target_loc), "w") as stdout_handle:
|
||||
with open(os.path.join(output_directory, self.stderr_target_loc), "w") as stderr_handle:
|
||||
try:
|
||||
start_time = time.perf_counter()
|
||||
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
||||
universal_newlines=True, start_new_session=True)
|
||||
stdout_t = threading.Thread(target = self.process_stream, args=(proc.stdout, "STDOUT", stdout_handle))
|
||||
stderr_t = threading.Thread(target = self.process_stream, args=(proc.stderr, "STDERR", stderr_handle))
|
||||
stdout_t.start()
|
||||
stderr_t.start()
|
||||
return_code = proc.wait(timeout = time_out)
|
||||
end_time = time.perf_counter()
|
||||
elapsed = end_time - start_time
|
||||
timed_out = False
|
||||
except subprocess.TimeoutExpired:
|
||||
timed_out = True
|
||||
elapsed = time_out
|
||||
self.logger.info("Timeout expired for process. Terminating process group w/sigterm")
|
||||
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
|
||||
# wait 10 seconds for a graceful shutdown.
|
||||
try:
|
||||
return_code = proc.wait(timeout = 10)
|
||||
except subprocess.TimeoutExpired:
|
||||
self.logger.info("Process unresponsive. Terminating process group w/sigkill")
|
||||
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
|
||||
return_code = self.TIMEOUT_RETURNCODE
|
||||
stdout_t.join()
|
||||
stderr_t.join()
|
||||
return {
|
||||
"stdout": self.stdout_target_loc,
|
||||
"stderr": self.stderr_target_loc,
|
||||
"return_code": return_code,
|
||||
"output_directory": output_directory,
|
||||
"elapsed_time": elapsed,
|
||||
"timed_out": timed_out
|
||||
}
|
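A minimal usage sketch (illustrative only; the command, log file names, and output directory are placeholders):

# Illustrative sketch: run a short command with a timeout and inspect the result dict.
from arg_satcomp_solver_base.solver.run_command import CommandRunner

runner = CommandRunner("stdout.log", "stderr.log")
result = runner.run(["echo", "hello"], "/tmp", time_out=10)
print(result["return_code"], result["elapsed_time"], result["timed_out"])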
@ -0,0 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Solver:
|
||||
"""Solver interface"""
|
||||
def solve(self, formula_file: str, request_directory_path: str, workers: list, task_uuid: str, timeout: int, formula_language: str, solver_argument_list: list):
|
||||
"""Interface for solve command"""
|
@ -0,0 +1,100 @@
"""Implementation of classes for interacting with SQS"""
import logging

from botocore.exceptions import ClientError


class SqsQueueException(Exception):
    """Exception for SqsQueue errors"""


class QueueMessage:
    """Class to represent an SQS message"""

    def __init__(self, msg, queue_resource):
        self.logger = logging.getLogger("QueueMessage")
        self.logger.setLevel(logging.DEBUG)
        self.queue_resource = queue_resource
        self.msg = msg

    def read(self):
        return self.msg.body

    def delete(self):
        self.logger.info("Deleting message with receipt handle: %s", self.msg.receipt_handle)
        msg_txt = self.msg.body
        try:
            self.queue_resource.delete_messages(Entries=[
                {
                    'Id': "test",
                    'ReceiptHandle': self.msg.receipt_handle
                },
            ])
        except ClientError as ex:
            self.logger.error("Failed to delete message from SQS queue")
            self.logger.exception(ex)
            raise ex
        else:
            return msg_txt


class SqsQueue:
    """Class to represent an SQS queue"""
    def __init__(self, queue_resource, queue_name):
        self.queue_resource = queue_resource
        self.queue_name = queue_name
        self.logger = logging.getLogger("SqsQueue")

    def get_message(self):
        """Get a single message off the queue if it exists"""
        try:
            self.logger.info("Trying to get message from queue %s", self.queue_name)
            messages = self.queue_resource.receive_messages(
                AttributeNames=['All'],
                MessageAttributeNames=['All'],
                MaxNumberOfMessages=1,
                WaitTimeSeconds=20,
            )
        except ClientError as e:
            self.logger.error("Failed to get message from SQS queue")
            self.logger.exception(e)
            raise SqsQueueException(f"Failed to get message from SQS queue {self.queue_name}")
        if messages is None or len(messages) == 0:
            return None
        return QueueMessage(messages[0], self.queue_resource)

    def put_message(self, msg):
        """Put a single message on the queue"""
        try:
            self.logger.info("Trying to put message onto queue %s", self.queue_name)
            messages = self.queue_resource.send_message(
                MessageBody=msg
            )
        except ClientError as e:
            self.logger.error("Failed to put message on SQS queue")
            self.logger.exception(e)
            raise SqsQueueException(f"Failed to put message on SQS queue {self.queue_name}")

    def purge(self):
        try:
            self.logger.info(f"Trying to purge queue {self.queue_name}")
            self.queue_resource.purge()
        except ClientError as e:
            self.logger.error(f"Failed to purge queue {self.queue_name}")
            self.logger.exception(e)
            raise SqsQueueException(f"Failed to purge queue {self.queue_name}")

    @staticmethod
    def get_sqs_queue(queue_name: str):
        import boto3
        sqs = boto3.resource('sqs')
        queue = sqs.get_queue_by_name(QueueName=queue_name)
        return SqsQueue(queue, queue_name)

    @staticmethod
    def get_sqs_queue_from_session(session, queue_name: str):
        sqs = session.resource('sqs')
        queue = sqs.get_queue_by_name(QueueName=queue_name)
        return SqsQueue(queue, queue_name)
@ -0,0 +1 @@
# Implement your code here.
@ -0,0 +1,38 @@
import threading
import polling2

from arg_satcomp_solver_base.solver.run_command import CommandRunner
from arg_satcomp_solver_base.task_end_notification.task_end_notifier import TaskEndNotifier


class TaskEndNotificationPoller(threading.Thread):
    task_end_notifier: TaskEndNotifier
    command_runner: CommandRunner
    previous_notification_id: str = None

    def __init__(self, task_end_notifier: TaskEndNotifier, command_runner: CommandRunner):
        threading.Thread.__init__(self)
        self.task_end_notifier = task_end_notifier
        self.command_runner = command_runner

    def run(self):
        while True:
            self.wait_for_notification()
            self.command_runner.run(cmd=["/competition/cleanup"], output_directory="/tmp", time_out=10)

    @polling2.poll_decorator(check_success=lambda has_notification: has_notification, step=0.5, poll_forever=True)
    def wait_for_notification(self):
        return self._check_for_notification()

    def _check_for_notification(self):
        notification_id = self.task_end_notifier.check_for_task_end(self.previous_notification_id)
        if notification_id is not None:
            self.previous_notification_id = notification_id
            return True
        return False

    @staticmethod
    def get_task_end_notification_poller():
        task_end_notifier = TaskEndNotifier.get_task_end_notifier()
        command_runner = CommandRunner("stdout.log", "stderr.log")
        return TaskEndNotificationPoller(task_end_notifier, command_runner)
@ -0,0 +1,57 @@
import time
import uuid

import boto3


class TaskEndNotifier:

    def __init__(self, table, notification_expiration_time=5):
        self.table = table
        self.node_expiration_time = notification_expiration_time

    def notify_task_end(self, leader_ip: str):
        current_time = int(time.time())
        notification_id = str(uuid.uuid4())
        self.table.update_item(
            Key={
                'leaderIp': leader_ip
            },
            UpdateExpression="set notificationId = :notificationId, lastModified = :lastModifiedVal",
            ExpressionAttributeValues={
                ":notificationId": notification_id,
                ":lastModifiedVal": current_time
            }
        )

    def check_for_task_end(self, previous_notification_id=None):
        expiration_time = int(time.time() - self.node_expiration_time)

        filter_expression = "#lastModifiedKey > :lastModifiedVal"
        attribute_names = {
            "#lastModifiedKey": "lastModified"
        }
        attribute_values = {
            ":lastModifiedVal": expiration_time
        }
        if previous_notification_id is not None:
            filter_expression += " and not (#notificationId = :notificationId)"
            attribute_values[":notificationId"] = previous_notification_id
            attribute_names["#notificationId"] = "notificationId"

        notifications = self.table.scan(
            Select="ALL_ATTRIBUTES",
            FilterExpression=filter_expression,
            ExpressionAttributeNames=attribute_names,
            ExpressionAttributeValues=attribute_values,
            ConsistentRead=True
        )
        if len(notifications["Items"]) > 0:
            return notifications["Items"][0]["notificationId"]
        return None

    @staticmethod
    def get_task_end_notifier():
        dynamodb_resource = boto3.resource("dynamodb")
        table = dynamodb_resource.Table("TaskEndNotification")
        return TaskEndNotifier(table)
@ -0,0 +1,60 @@
"""General utils package"""
import json
import logging
import os
import shutil
import uuid
from datetime import datetime

import pytz


class TimeProvider:

    def get_current_time(self):
        return datetime.now(pytz.utc)


class FileOperations:
    def __init__(self):
        self.logger = logging.getLogger("FileOperations")

    def save_to_file(self, path: str, body: str):
        """Save a string to a file

        Raises OSError if unable to open
        """
        with open(path, "w", encoding="UTF-8") as file_handle:
            file_handle.write(body)

    def create_directory(self, task_uuid: str):
        return self.create_custom_directory("/tmp/", task_uuid)

    def create_custom_directory(self, directory_name, task_uuid: str):
        request_directory_path = os.path.join(directory_name, task_uuid)
        os.mkdir(request_directory_path)
        return request_directory_path

    def read_json_file(self, path: str):
        with open(path) as solver_out_handle:
            return json.loads(solver_out_handle.read())

    def write_json_file(self, path: str, item: dict):
        return self.save_to_file(path, json.dumps(item))

    def read_log_file(self, file_path, max_file_length=None):
        try:
            with open(file_path, 'rb') as f:
                if max_file_length is not None:
                    return f.read(max_file_length).decode("UTF-8")
                return f.read().decode("UTF-8")
        except FileNotFoundError as e:
            self.logger.error("Could not open log file %s", file_path)
            self.logger.exception(e)
            return None

    def generate_uuid(self):
        return str(uuid.uuid4())

    def remove_directory(self, path: str):
        if os.path.exists(path):
            shutil.rmtree(path)
@ -0,0 +1,72 @@
"""
This module runs in the background of the Worker Base Container.
It reads the worker node status from a saved JSON file and updates the manifest.
"""
import json
import logging
import os
import socket
import threading
import uuid
import time

from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest, NodeType
from arg_satcomp_solver_base.utils import FileOperations


class WorkerTimeoutException(Exception):
    pass


class WorkerStatusChecker(threading.Thread):
    """Thread that runs in the background, fetching the worker status and
    updating the manifest every second. Throws an exception if the status has not been updated within the given expiration time."""

    worker_poll_sleep_time = 1
    file_operations: FileOperations = FileOperations()

    def __init__(self, node_manifest: DynamodbManifest, expiration_time: int, path: str):
        threading.Thread.__init__(self)
        self.node_manifest = node_manifest
        self.logger = logging.getLogger("WorkerPoller")
        self.logger.setLevel(logging.DEBUG)
        self.expiration_time = expiration_time
        self.path = path

    def get_worker_info(self):
        input = os.path.join(self.path, "worker_node_status.json")
        return self.file_operations.read_json_file(input)

    def update_manifest(self, uuid: str, local_ip_address: str, status: str):
        self.logger.info(f"Giving node heartbeat for node {uuid} with ip {local_ip_address}")
        self.node_manifest.register_node(str(uuid), local_ip_address, status, NodeType.WORKER.name)

    def run(self):
        local_ip_address = socket.gethostbyname(socket.gethostname())
        node_uuid = uuid.uuid4()
        self.logger.info(f"Registering node {node_uuid} with ip {local_ip_address}")
        while True:
            try:
                current_timestamp = int(time.time())
                self.logger.info("Trying to get worker node status")
                worker_status_info = self.get_worker_info()

                status = worker_status_info.get("status")
                self.logger.info(f"Worker status is {status}")

                worker_timestamp = worker_status_info.get("timestamp")
                self.logger.info(f"Worker status updated time: {worker_timestamp}")

                if worker_timestamp < current_timestamp - self.expiration_time:
                    raise WorkerTimeoutException("Timed out waiting for worker node status to update")

                self.update_manifest(node_uuid, local_ip_address, status)

                self.logger.info("#######################")
                time.sleep(self.worker_poll_sleep_time)

            except FileNotFoundError:
                self.logger.error("worker_node_status file is not generated")
            except json.JSONDecodeError as e:
                self.logger.error("Worker status file is not valid Json")
                self.logger.exception(e)
@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""This file is the main entrypoint for the Worker Node Base Container"""
import logging
import os
import subprocess
import sys
from time import sleep

sys.path.append('/opt/amazon/lib/python3.8/site-packages/')

from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest
from arg_satcomp_solver_base.worker.worker import WorkerStatusChecker
from arg_satcomp_solver_base.task_end_notification.task_end_notification_poller import TaskEndNotificationPoller
from arg_satcomp_solver_base.utils import FileOperations

logging.getLogger("botocore").setLevel(logging.CRITICAL)
logging.getLogger("boto3").setLevel(logging.CRITICAL)

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)


class PollerTimeoutException(Exception):
    pass


def run_worker(path):
    cmd = os.path.join(path, "worker")
    stdout = os.path.join(path, "worker_stdout.log")
    stderr = os.path.join(path, "worker_stderr.log")
    with open(stdout, "wb") as out, open(stderr, "wb") as err:
        subprocess.Popen(cmd, stdout=out, stderr=err)


def write_leader_node_json(leader_nodes, dir):
    file_operations: FileOperations = FileOperations()
    output = os.path.join(dir, "leader_node_status.json")

    ## write most recently reporting leader
    leader_nodes.sort(key=lambda a: a['lastModified'], reverse=True)
    node_to_write = leader_nodes[0]

    ## get rid of decimal; incompatible with json writer
    node_to_write['lastModified'] = int(node_to_write['lastModified'])
    return file_operations.write_json_file(output, node_to_write)


## Small hack to support getting the leader IP by workers.
## Waits up to 10 minutes.
def get_current_leader_nodes(node_manifest, logger):
    poll_sleep_time = 1
    poll_timeout = 600
    leader_nodes = []
    wait_time = 0
    logger.info("Waiting for 1 leader node")
    while True:
        leader_nodes = node_manifest.get_all_ready_leader_nodes()
        if len(leader_nodes) >= 1:
            break
        sleep(poll_sleep_time)
        wait_time += poll_sleep_time
        if wait_time > poll_timeout:
            raise PollerTimeoutException("Timed out waiting for leader to report. "
                                         "Zero leaders reported")

    logger.info(f"Leaders reported: {leader_nodes}")
    return leader_nodes


if __name__ == "__main__":
    logger = logging.getLogger("Worker Entrypoint")
    dir = "/competition"

    logger.info("Getting node manifest")
    node_manifest = DynamodbManifest.get_dynamodb_manifest()

    logger.info("Writing leader IP to json file")
    leaders = get_current_leader_nodes(node_manifest, logger)
    write_leader_node_json(leaders, dir)

    logger.info("Running worker script from participant")
    run_worker(dir)  # Run script to save status to worker_node_status.json
    sleep(1)  # Wait for creation of worker_node_status.json file

    logger.info("Starting worker status checker")
    worker_status = WorkerStatusChecker(node_manifest, 5, dir)
    worker_status.start()

    task_end_notification_poller = TaskEndNotificationPoller.get_task_end_notification_poller()
    task_end_notification_poller.start()
@ -0,0 +1,15 @@
FROM satcomp-infrastructure:common
RUN apt-get install -y psmisc
RUN pip3 install flask waitress boto3 pytz polling2
COPY --chown=ecs-user satcomp-solver-resources/ /opt/amazon/
COPY --chown=ecs-user satcomp-worker/resources/worker /competition/worker
COPY --chown=ecs-user satcomp-worker/resources/cleanup /competition/cleanup
ENV PYTHONPATH=/opt/amazon:.:
RUN chmod u+x /opt/amazon/arg_satcomp_solver_base/worker_entrypoint.py
RUN chmod u+x /competition/worker
RUN chmod u+x /competition/cleanup
RUN service ssh start
USER ecs-user
EXPOSE 22
ENTRYPOINT /usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config & /opt/amazon/arg_satcomp_solver_base/worker_entrypoint.py
@ -0,0 +1,3 @@
#!/bin/bash
echo "Cleaning up: Killing all mpi processes"
pkill -9 mpi
@ -0,0 +1,18 @@
#!/usr/bin/env python3
""" This is a test script that saves worker status to json file every second"""
import json
import time


def update_worker_status():
    while True:
        data = {"status": "READY", "timestamp": int(time.time())}
        with open(
            "/competition/worker_node_status.json",
            "w",
        ) as statusfile:
            statusfile.write(json.dumps(data))
        time.sleep(1)


if __name__ == "__main__":
    update_worker_status()
@ -0,0 +1,530 @@
|
||||
# Configuring AWS Infrastructure for SAT/SMT-Comp
|
||||
|
||||
This README describes how to build and configure AWS infrastructure for the SAT and SMT parallel and cloud tracks. If you are only competing in the parallel track, a handful of the steps are unnecessary. We will point out these cloud-only steps as we go.
|
||||
|
||||
You will proceed through the following steps.
|
||||
|
||||
* [Mallob Quickstart](#mallob-quickstart). This will run through all the steps using just two scripts so that you can see an example cloud solver up and running quickly. After we see it working, then we will explain the steps in more detail.
|
||||
* [Managing AWS Solver Infrastructure](#managing-solver-infrastructure). This section describes the infrastructure creation process in more detail, and how you can switch between parallel and cloud configurations for testing.
|
||||
* [Preparing Docker Images for Upload to AWS](#preparing-docker-images). This section describes how to upload your solver to AWS.
|
||||
* [Storing Analysis Problems in the Cloud](#storing-analysis-problems-in-the-cloud). This section describes how we store problems for your solver in S3, the main storage service for AWS.
|
||||
* [Running your Solver in the Cloud](#running-your-solver). This section describes how we run your solver in the cloud.
|
||||
|
||||
Additional resources are available in a FAQ:
|
||||
|
||||
* [FAQ/Troubleshooting](#faq--troubleshooting)
|
||||
|
||||
Although there is a lot of information about how to build with AWS infrastructure here, most of the development of your solver can be performed using Docker locally on your laptop as described in the [Solver Development README](../docker/README-Solver-Development.md) (even for the cloud track). You only need to involve AWS when you want to look at performance testing on the competition hardware.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
To use the AWS infrastructure, you will need the following tools installed:
|
||||
|
||||
- [python3](https://www.python.org/). To install the latest version for your platform, go to the [downloads](https://www.python.org/downloads/) page.
|
||||
- [docker](https://www.docker.com/). There is a button to download Docker Desktop on the main page.
|
||||
- [boto3](https://aws.amazon.com/sdk-for-python/). Once you have python3 installed (above), you can install this with `pip3 install boto3`.
|
||||
|
||||
Basic knowledge of AWS accounts and services is helpful, but we will walk you through all of the necessary steps.
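To confirm that `python3` and `boto3` are installed and importable, you can run a quick one-liner (shown here purely as a sanity check):

```text
python3 -c "import boto3; print(boto3.__version__)"
```

If this prints a version number, the Python prerequisites are in place.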
|
||||
|
||||
We recommend that your development environment be hosted on Mac OS Monterey (v12.6), Amazon Linux 2 (AL2) or Ubuntu 20. Other platforms may work, but have not been tested.
|
||||
|
||||
### Installing the AWS CLI
|
||||
|
||||
In order to work with AWS, you must install the [AWS CLI](https://aws.amazon.com/cli/) for your platform. To get started, follow the directions for your operating system here:
|
||||
[https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
|
||||
If you have previously installed the AWS CLI, you should ensure it is up to date.
|
||||
|
||||
|
||||
### Creating AWS Credentials for your AWS Account
|
||||
|
||||
You must configure AWS credentials to access the resources in your account. For the purposes of simplifying the competition configurations, you will use a so-called _root level access key_. This is _not_ the best practice for security (which would be to create a separate user in your account) but it will suffice for the competition (see FAQ for more information).
|
||||
|
||||
In this section, you will be working with the Identity and Access Management (IAM) console. To create a root level access key go to the [IAM Console](https://console.aws.amazon.com/iamv2/). You can get to the console by clicking the previous link or by searching for "IAM" in the search field at the top of the [AWS Console](https://console.aws.amazon.com) as shown [here](readme-images/iam-search.png) and then clicking the resulting link).
|
||||
|
||||
On the IAM page, click "My Security Credentials" on the right side of the IAM console page as shown [here](readme-images/iam-quicklinks.png). Next, click on "Access keys (access key ID and secret access key)," then "Create New Access Key," and then "Show Access Key."
|
||||
This will create an Access Key ID and a Secret Access Key. Copy these for use in the next step.
|
||||
|
||||
Next, on your local workstation, create a `~/.aws/credentials` file containing the following text:
|
||||
|
||||
[default]
|
||||
aws_access_key_id=ACCESS_KEY_ID
|
||||
aws_secret_access_key=SECRET_ACCESS_KEY
|
||||
region=us-east-1
|
||||
|
||||
`ACCESS_KEY_ID` and `SECRET_ACCESS_KEY` are the keys that you created in the previous step.
|
||||
|
||||
After installing the AWS CLI and creating credentials, check your configuration by attempting to run an AWS command. An example command that should work is:
|
||||
```text
|
||||
aws sts get-caller-identity
|
||||
```
|
||||
|
||||
If you receive an error message, see the [FAQ/Troubleshooting](#faq--troubleshooting) section at the bottom of this document.
|
||||
|
||||
We recommend that when you've completed account set-up, you follow the steps in the FAQ [How Do I Track My Spending?](#how-do-i-track-my-spending) to track your spending. For now, we'll continue with the setup.
|
||||
|
||||
## Mallob Quickstart
|
||||
|
||||
To allow you to see a solver up and running quickly, we have created two commands that provide a quick start to getting a Mallob container running in AWS. The first command, `quickstart-build`, creates the AWS infrastructure for a cloud solver, builds the Mallob docker images, and uploads them to AWS. The second command, `quickstart-run`, creates a four-node cluster (1 leader and 3 workers) if one is not already running, waits for the cluster to be ready, then submits a solving job and reads the response from the output queue. After you test out the Mallob solver using these commands, you should go through the rest of the README to see the steps that they perform and how to do this with your own solver.
|
||||
|
||||
### Quickstart Build
|
||||
|
||||
This command creates an account, builds Mallob (if not already built), and uploads the Docker images to AWS. The form is simply:
|
||||
|
||||
```text
|
||||
./quickstart-build
|
||||
```
|
||||
|
||||
This command will require 10-20 minutes to run. Once it is complete, Mallob should be ready to run, and the infrastructure should be set up to run a cloud solver.
|
||||
|
||||
### Quickstart Run
|
||||
|
||||
This command will spin up a cluster, submit a job for a problem called `test.cnf` that we uploaded to a storage location in S3 (the AWS storage service), wait for the result, and then spin down the cluster. The form of the command is:
|
||||
|
||||
```text
|
||||
./quickstart-run
|
||||
```
|
||||
|
||||
Once the solver is completed, you should see a message back from the solver that the problem was completed and that the result was SATISFIABLE. Later on we will show you how to monitor a run using an AWS service called _CloudWatch_ and extract the intermediate files produced by the run, including the stdout and stderr logs, using an AWS service called _S3_.
|
||||
|
||||
Voila!
|
||||
|
||||
Now we walk through the different steps in more detail.
|
||||
|
||||
## Managing Solver Infrastructure.
|
||||
|
||||
The `quickstart-build` command from the previous section creates the AWS infrastructure necessary to run your solver. In that creation process, the infrastructure is set up to run cloud (distributed) solvers. If you want to switch to hosting a parallel solver, run the update command described in the next section. If something goes wrong, then delete the infrastructure and re-create it as described in [Deleting and Re-creating the Infrastructure](#deleting-and-re-creating-the-infrastructure).
|
||||
|
||||
### Switching Between Cloud and Parallel Tracks
|
||||
|
||||
If you ran the Mallob `quickstart-build`, then you have already created your solver infrastructure and don't have to do any further work. In that creation process, the infrastructure is set up for cloud (distributed) solvers. Once the infrastructure has been created, if you want to update it to switch between machine configurations for the cloud and parallel tracks, you can run the `update-solver-infrastructure` script:
|
||||
|
||||
```text
|
||||
./update-solver-infrastructure --solver-type SOLVER_TYPE
|
||||
```
|
||||
|
||||
where:
|
||||
* `SOLVER_TYPE` is either `cloud` or `parallel`, depending on which kind of solver you are running. Note that cloud solvers run on multiple 16-core machines ([m6i.4xlarge](https://aws.amazon.com/ec2/instance-types/)) with 64GB memory, while parallel solvers run on a single 64-core machine ([m6i.16xlarge](https://aws.amazon.com/ec2/instance-types/)) with 256GB memory.
|
||||
|
||||
This will change the instance type and memory configurations for the ECS images used. You can switch back and forth as needed.
|
||||
|
||||
### Deleting and Re-creating the Infrastructure
|
||||
|
||||
If something goes wrong and you want to start over, simply run the `delete-solver-infrastructure` script:
|
||||
```text
|
||||
./delete-solver-infrastructure
|
||||
```
|
||||
|
||||
This will delete the infrastructure and associated resources. **Note that this deletion includes any files that have been uploaded to your S3 bucket and also any ECR docker images that you have created.** It will not delete your AWS account or security credentials.
|
||||
|
||||
Next, you will create the AWS infrastructure necessary to build and test solvers. The SAT and SMT competitions both use the following infrastructure elements. These should "just work", but there is more information about the different parts of the infrastructure in the FAQ. To set up your resources, simply run the `create-solver-infrastructure` script that we have provided.
|
||||
|
||||
```text
|
||||
./create-solver-infrastructure --solver-type SOLVER_TYPE
|
||||
```
|
||||
|
||||
where:
|
||||
* `SOLVER_TYPE` is either `cloud` or `parallel`, depending on which kind of solver you are running. Note that cloud solvers run on multiple 16-core machines ([m6i.4xlarge](https://aws.amazon.com/ec2/instance-types/)) with 64GB memory, while parallel solvers run on a single 64-core machine ([m6i.16xlarge](https://aws.amazon.com/ec2/instance-types/)) with 256GB memory.
|
||||
|
||||
The script will take 5-10 minutes. When complete, all of the needed cloud resources should be created. The script polls until the creation is complete.
|
||||
|
||||
|
||||
|
||||
## Preparing Docker Images
|
||||
|
||||
### Creating the Base Docker Leader and Worker Images for Solvers
|
||||
|
||||
See the [SAT-Comp Docker Images README.md file](../docker/README.md) in the `docker` directory for instructions on how to build and test Docker images.
|
||||
|
||||
### Storing Solver Images in the ECR repository
|
||||
|
||||
Amazon stores your solver images in the [Elastic Container Registry (ECR)](https://console.aws.amazon.com/ecr).
|
||||
|
||||
The `create-solver-infrastructure` command described earlier creates an ECR repository with the same name as the project (comp23).
|
||||
|
||||
This repository will store the images for the leader and worker. Once you have created and tested a docker image (or images, for the cloud leader and worker) as described in the [SAT-Comp Docker Images README.md file](../Docker/README.md), you can upload them to your AWS account with the `ecr-push` script:
|
||||
|
||||
```text
|
||||
./ecr-push [--leader LEADER_IMAGE_TAG] [--worker WORKER_IMAGE_TAG]
|
||||
```
|
||||
|
||||
where:
|
||||
|
||||
* `LEADER_IMAGE_TAG` is the tagged name of the leader docker image (e.g. satcomp-mallob:leader).
|
||||
|
||||
* `WORKER_IMAGE_TAG` is the tagged name of the worker docker image (e.g. satcomp-mallob:worker).
|
||||
|
||||
The leader and worker tags are optional; you can upload one or both docker images with this command (though if neither is specified, the script exits with an error).
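
For example, assuming you built the Mallob images with the tags shown above, you could upload both images with:

```text
./ecr-push --leader satcomp-mallob:leader --worker satcomp-mallob:worker
```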
|
||||
|
||||
## Storing Analysis Problems in the Cloud
|
||||
|
||||
We use the AWS Simple Storage Service (S3) to store the analysis problems we want to solve with our solver. S3 has a concept of a "bucket" which acts like a filesystem. As part of the `create-solver-infrastructure` CloudFormation script, we have created a bucket for you where you can store files: `ACCOUNT_NUMBER-us-east-1-comp23`, and added a `test.cnf` file to this bucket for testing. You can start with the `test.cnf` example and skip the rest of this section until you wish to add additional files or buckets for testing your solver.
|
||||
|
||||
You can copy files to the bucket with a command similar to this one (when executed from the root directory of this repository, this re-copies the `my-problem.cnf` file to the default bucket):
|
||||
|
||||
```text
|
||||
aws s3 cp my-problem.cnf s3://ACCOUNT_NUMBER-us-east-1-comp23
|
||||
```
|
||||
|
||||
When `s3 cp` is complete, you will see your file(s) in the list of objects in the bucket:
|
||||
|
||||
```text
|
||||
aws s3 ls s3://ACCOUNT_NUMBER-us-east-1-comp23
|
||||
```
|
||||
|
||||
More information on creating and managing S3 buckets is found here: [https://aws.amazon.com/s3/](https://aws.amazon.com/s3/). The S3 command line interface is described in more detail here: [https://docs.aws.amazon.com/cli/latest/userguide/cli-services-s3-commands.html](https://docs.aws.amazon.com/cli/latest/userguide/cli-services-s3-commands.html).
|
||||
|
||||
## Running Your Solver
|
||||
After storing docker images: `comp23:leader` (and for the cloud solver: `comp23:worker`) and placing at least one query file in your S3 bucket, you are ready to run your solver.
|
||||
|
||||
### Running Your Solver Using `quickstart-run`
|
||||
You can use `quickstart-run` to spin up the cluster, run several files, and then spin down the cluster with one command. To run files other than the `test.cnf` file, you can provide a list of s3 file paths to run, one after the other. The command looks like:
|
||||
|
||||
```text
|
||||
quickstart-run [--s3-locations S3_LOCATIONS [S3_LOCATIONS ...]]
|
||||
```
|
||||
|
||||
In this case, after creating the cluster, it will try to run all files before spinning the cluster back down. This allows you a relatively safe way to run several jobs. **N.B.**: Keep an eye on it, though! If your solver fails while solving and does not generate an output, then the script will poll forever waiting for the result message from your solver, and will not spin down the cluster. In this case, follow the cleanup procedure below.
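
For example, assuming you have copied `my-problem.cnf` to the default bucket as described above, a two-problem run would look like:

```text
./quickstart-run --s3-locations s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf s3://ACCOUNT_NUMBER-us-east-1-comp23/my-problem.cnf
```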
|
||||
|
||||
|
||||
### Running Your Solver Using Basic Scripts
|
||||
|
||||
If you would like to have finer-grained control, choosing when to spin up and down the cluster, or building your own scripts, you can use the following steps:
|
||||
|
||||
1. Setup.
|
||||
|
||||
a. Setting capacity for the cluster in [EC2](https://aws.amazon.com/ec2/).
|
||||
|
||||
b. Setting the desired number of workers in [ECS](https://aws.amazon.com/ecs/).
|
||||
|
||||
2. Run. Submit a solve job to the SQS queue by specifying the S3 location of the problem and the desired number of workers.
|
||||
|
||||
3. Cleanup. Setting EC2/ECS capacity for cluster back to zero.
|
||||
|
||||
After setup, you can run any number of solve jobs before cleaning up.
|
||||
|
||||
Note: It is very important that you [clean up](#cluster-teardown) once you are done. EC2 nodes that remain active continue to be charged against your account.
|
||||
|
||||
### Cluster Setup
|
||||
|
||||
To set up and tear down the ECS cluster, we have provided a script called `ecs-config`. You provide the number of desired worker nodes as an argument. The script:
|
||||
|
||||
1. Creates an EC2 fleet large enough to run the experiment (number of workers + a leader node)
|
||||
2. Creates the correct number of tasks in the ECS service for the leader and workers.
|
||||
|
||||
To run the script:
|
||||
|
||||
```text
|
||||
ecs-config setup --workers [NUM_WORKERS]
|
||||
```
|
||||
|
||||
where:
|
||||
|
||||
* `NUM_WORKERS` is the number of worker nodes you want to allocate. To avoid problems with possible resource limits for your AWS account, we recommend that you set `NUM_WORKERS` to `1` when working through this document for the first time. If you are building for the parallel track, set `NUM_WORKERS` to `0`, as all solving will be performed by the leader node.
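
For example, a minimal cloud configuration with one leader and one worker would be started with:

```text
ecs-config setup --workers 1
```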
|
||||
|
||||
AWS typically requires 2-5 minutes to allocate nodes and host the ECS cluster. You can monitor the creation process by navigating to the [ECS console](https://us-east-1.console.aws.amazon.com/ecs) and selecting the _SatCompCluster_ link. [FIXME: link won't work outside us-east-1.]
|
||||
|
||||
The next page will show a list of job queues, including:
|
||||
|
||||
```text
|
||||
job-queue-comp23-SolverLeaderService-...
|
||||
job-queue-comp23-SolverWorkerService-...
|
||||
```
|
||||
|
||||
The service is running and available when the number of running tasks for the leader is `1` and the number of running tasks for the Worker service is `n`, as chosen with `NUM_WORKERS` in the `ecs-config` script argument.
|
||||
|
||||
Note: Your AWS account is charged for the number of EC2 instances that you run, so your bill will rise with the number of workers that you allocate.
|
||||
|
||||
### Job Submission and Execution
|
||||
|
||||
Before submitting a job, check that a cluster is set up and running, and the desired problem is available in an accessible S3 bucket.
|
||||
|
||||
Solver jobs are submitted by sending SQS messages. We have provided a `send_message` script to do this for you. You provide the S3 location of the file to run and number of desired worker nodes. The script submits an SQS request to the `ACCOUNT_NUMBER-us-east-1-SatCompQueue` queue, which the solver leader container is monitoring.
|
||||
|
||||
To run the script:
|
||||
|
||||
```text
|
||||
send_message --location S3_LOCATION --workers NUM_WORKERS [--timeout TIMEOUT] [--name NAME] [--format FORMAT] [--args ARGS]
|
||||
```
|
||||
|
||||
with required arguments:
|
||||
* `S3_LOCATION` is the S3 location of the query file. For example, for the bucket we described earlier, the location would be `s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf`.
|
||||
* `NUM_WORKERS` is the number of worker nodes to allocate for this problem. Again, we recommend that you start with `NUM_WORKERS` as `1` when beginning. For parallel solvers, you should set `NUM_WORKERS` to `0`.
|
||||
|
||||
and optional arguments:
|
||||
* `TIMEOUT` is an optional parameter that sets the timeout in seconds for the solver. Defaults to 60s.
|
||||
* `NAME` is an optional parameter providing a name for the solver (in case you want to host multiple solvers within one container). Defaults to the empty string.
|
||||
* `FORMAT` is an optional parameter providing the problem format. Defaults to the empty string.
|
||||
* `ARGS` is an optional parameter allowing one or more arguments to be passed to the solver. Defaults to the empty list.
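
For example, to solve the `test.cnf` problem from the default bucket on a single worker with the default timeout:

```text
send_message --location s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf --workers 1
```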
|
||||
|
||||
The leader container will pull the message off of the queue, fetch the problem from S3, and begin execution. For more information on the steps performed, see [Extending the Solver Base Container](#understanding-the-solver-architecture-and-extending-the-competition-base-leader-container) below.
|
||||
|
||||
### Debugging on the Cloud: Monitoring and Logging
|
||||
|
||||
Debugging takes more effort on the cloud than on your local machine, because it takes time to upload your solver and deploy it, and observability can be reduced. Debug first at small scale on your local machine as described in the [Solver Development README](../docker/README-Solver-Development.md).
|
||||
|
||||
To debug on the cloud, there are two main mechanisms that we use to examine our solvers:
|
||||
1. CloudWatch log groups, which allow a real-time view of stdout and stderr from your solver.
|
||||
1. S3 storage of intermediate files, which allows you to dump files that you can inspect after the run.
|
||||
|
||||
**CloudWatch Log Groups:** To watch the logs in real-time you can navigate to the CloudWatch console, then choose the "log groups" item on the left side menu as shown in Figure 2.
|
||||
|
||||
![](readme-images/cloudwatch-menu.png)
|
||||
_Figure 2: CloudWatch log groups menu_
|
||||
|
||||
After choosing the log groups menu item, you should see logs related to `/ecs/comp23-leader` and `/ecs/comp23-worker` as shown in Figure 3.
|
||||
|
||||
![](readme-images/cloudwatch-log-groups.png)
|
||||
_Figure 3: CloudWatch log groups view_
|
||||
|
||||
These logs will capture both stdout and stderr from your solver containers. The logs are captured in something called _streams_, usually tied to a specific instance (so, for a cluster you will have one stream for the leader and one for each worker). Note that if you don't use a solver for a while, CloudWatch will close the stream and open a new one, as you see in Figure 4:
|
||||
|
||||
![](readme-images/cloudwatch-log-streams.png)
|
||||
_Figure 4: CloudWatch log groups view_
|
||||
|
||||
The logs themselves are visible as lists of events as shown in Figure 5:
|
||||
|
||||
![](readme-images/cloudwatch-log-items.png)
|
||||
_Figure 5: CloudWatch log items view_
|
||||
|
||||
AWS provides a very full-featured log query language called [CloudWatch Logs Insights](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/AnalyzingLogData.html) (visible in the menu in Figure 2) that you can use to search for specific patterns within logs, which can often help speed up your debugging. A full discussion of Insights is outside the scope of this document, but the link above leads to full documentation about how to use it.
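
As a small illustration, a query like the following, run against the `/ecs/comp23-leader` log group, surfaces recent lines containing "ERROR" (adjust the filter pattern to whatever your solver actually logs):

```text
fields @timestamp, @message
| filter @message like /ERROR/
| sort @timestamp desc
| limit 20
```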
|
||||
|
||||
**S3 Storage of Intermediate Files:**
|
||||
When the solver is invoked we store the `input.json` file and the analysis problem in a temporary directory that is passed as the argument to the solver. We recommend that you use the same directory (or create a subdirectory) for storing all of the intermediate files used during solving. At the conclusion of solving, all contents in this directory _for the leader solver_ will be uploaded to your S3 bucket (`ACCOUNT_NUMBER-us-east-1-comp23`) under a `/tmp/UUID` directory with a unique UUID so that you can inspect them offline. This way, you can instrument your solvers to write to the filesystem for later analysis. **N.B.:** currently this is only available for the leader solver; however, we may add the capability to upload data from the workers depending on customer feedback. If you store the stdout and stderr logs to this directory (as the Mallob example does), then the output message produced by the solver will identify the UUID used for the bucket for easy reference.
|
||||
|
||||
### Cluster Teardown
|
||||
|
||||
After testing, remember to tear down the cluster by running:
|
||||
|
||||
```text
|
||||
ecs-config shutdown
|
||||
```
|
||||
|
||||
This will terminate all EC2 instances and reset the number of leaders and workers to zero in the ECS service.
|
||||
|
||||
You should verify that no EC2 instances are running after running the teardown script. Navigate to the [EC2 console](https://console.aws.amazon.com/ec2) and check the `Instances` tab. There should be no running instances. Note that it might take a minute or two for the instances to shut down after running the script.
|
||||
|
||||
Note: You are responsible for any charges made to your AWS account. While we have tested the shutdown script and believe it to be robust, you are responsible for monitoring the EC2 cluster to make sure resources have shutdown properly. We have alternate directions to shutdown resources using the console in the Q&A section.
|
||||
|
||||
|
||||
|
||||
## FAQ / Troubleshooting
|
||||
|
||||
#### How Do I Track My Spending?
|
||||
|
||||
[AWS CloudFormation](https://aws.amazon.com/cloudformation/) can be used to automate many cloud configuration activities. The file [setup-account.yaml](setup-account.yaml) in this repository is an optional CloudFormation template that can help you track your spending. It sets up notifications that are sent by email when your account reaches 20%, 40%, 60%, 80%, 90%, 95%, and 100% of the monthly account budget. This provides a window into the current spend rate for building and testing your solver.
|
||||
|
||||
Here is the command to run it:
|
||||
|
||||
aws cloudformation create-stack --stack-name setup-account-stack --template-body file://setup-account.yaml --parameters ParameterKey=emailAddress,ParameterValue=YOUR_EMAIL_ADDRESS
|
||||
|
||||
_Note_: Be sure to double check the email address is correct!
|
||||
|
||||
After running the aws cloudformation command, you can monitor the installation process by logging into your AWS account and navigating to the [CloudFormation console](https://console.aws.amazon.com/cloudformation).
|
||||
Make sure you are in the us-east-1 region (You can select the region in the top right corner of the console page).
|
||||
You should see a stack named `setup-account-stack`.
|
||||
|
||||
![](readme-images/cloudformation.png)
|
||||
_Figure 1: Cloudformation result_
|
||||
|
||||
By clicking on this stack, and choosing `events`, you can see the resources associated with the stack. After a short time, you should see the `CREATE_SUCCEEDED` event. If not (e.g., the email address was not valid email address syntax), you will see a `CREATE_FAILED` event. In this case, delete the stack and try again. If you have trouble, email us at: [sat-comp@amazon.com](mailto:sat-comp@amazon.com) and we will walk you through the process.
|
||||
|
||||
Although it is handy to get emails when certain account budget thresholds have been met, it is both useful and important to check by-the-minute account spending on the console: [https://console.aws.amazon.com/billing/home](https://console.aws.amazon.com/billing/home).
|
||||
|
||||
|
||||
#### I'm only submitting to the parallel track and not the cloud track. Do I need a worker image?
|
||||
|
||||
No. If you are submitting to the parallel track only, you do not need a worker image. For the parallel track, we will assume that the leader manages all threading and communications within the single (multi-core) compute node.
|
||||
|
||||
#### How do I manage my account after SAT-Comp if I want to follow security best practices?
|
||||
|
||||
If you continue using your AWS account after the competition, we recommend that you follow AWS best practices as described here:
|
||||
[https://docs.aws.amazon.com/IAM/latest/UserGuide/best-practices.html#create-iam-users](https://docs.aws.amazon.com/IAM/latest/UserGuide/best-practices.html#create-iam-users)
|
||||
|
||||
#### What is the sequence of steps performed by the leader container to set up the `input.json` file?
|
||||
|
||||
The leader infrastructure performs the following steps:
|
||||
|
||||
1. Pull and parse a message from the `ACCOUNT_NUMBER-us-east-1-SatCompQueue` SQS queue with the format described in the [Job Submission and Execution section](../infrastructure/README.md#fixme).
|
||||
1. Pull the appropriate solver problem from S3 from the location provided in the SQS message.
|
||||
1. Save the solver problem on a shared EFS drive so that it can also be accessed by the worker nodes.
|
||||
1. Wait until the requested number of workers have reported their status as READY along with their IP addresses.
|
||||
1. Create a working directory for this problem.
|
||||
1. Create and write an `input.json` with the IP addresses of the worker nodes as well as the location of the problem to the working directory
|
||||
1. Invoke the executable script located at path `/competition/solver` with a single parameter: the path to the working directory. The solver script will look for the `input.json` file in the working directory. It is also the location where solver output and error logs will be written.
|
||||
1. The return code for `/competition/solver` will determine the expected result for the solver: A return code of 10 indicates SAT, 20 indicates UNSAT, 0 indicates UNKNOWN, and all other return codes indicate an error.
|
||||
1. Upon task completion, notify all workers that the task has ended.
|
||||
|
||||
#### I ran the infrastructure scripts, and they seemed to work, but when I go out to the console, I don't see any of the infrastructure: S3 buckets, ECS Queues, etc. What happened?
|
||||
|
||||
The most common mistake people make when starting with AWS is not choosing the correct *region* for executing their jobs. In the console on the top right there is a selectable region:
|
||||
|
||||
<img src="readme-images/regions.png" alt="image of region selector" width="200"/>
|
||||
|
||||
Make sure that the region selected in the top-right of the console page is `us-east-1`.
|
||||
|
||||
#### I created a leader node and submitted a job, but the leader keeps saying it is waiting for worker nodes and not running my solver. What happened?
|
||||
|
||||
There are two likely causes. First it could be that you submitted an SQS that asked for more workers than you configured when you set up your cluster. Make sure that the number of worker nodes equals or exceeds the number of worker nodes that were requested in the SQS message. Second, if you just set up the ECS cluster, then it is likely that either the requested EC2 servers have not yet initialized, or that the ECS cluster has not yet started running on them. It takes 2-5 minutes for the resources to become available after they are requested.
|
||||
|
||||
There is also a less-likely cause, involving capacity limits. EC2 sets default capacity limits on the number of compute nodes that are available to an account. If you are running large-scale tests, it may be that you have exceeded the default number of nodes. To check, navigate to the EC2 console for the account, and click on "Limits" on the left side. Type "All standard" in the search bar and determine the current limit. The limit is specified in terms of vCPUs, so each m6i.4xlarge image uses 16 vCPUs, and every m6i.16xlarge uses 64 vCPUs. If you need additional vCPUs, click on the "All Standard" link and request a limit increase. **N.B.** Running a large cluster can become expensive. Make sure you limit your large-scale testing to preserve your AWS credits.
|
||||
|
||||
#### I'm watching the ECS cluster, but nothing is happening; no nodes are being created. What happened?
|
||||
|
||||
This can happen if you try to start up a cloud solver when the infrastructure is configured for parallel execution or vice versa. In this case, the machine image where you want to deploy does not match the requirements of the ECS task. Make sure that if you are running a cloud solver, you have not configured the infrastructure for parallel. If you are unsure, you can always run `update-solver-infrastructure` with the expected type. If the infrastructure is already set up for this type, it will be a no-op.
|
||||
|
||||
|
||||
#### Suppose I want to use the console to setup my ECS clusters, or to monitor them. How do I do that?
|
||||
|
||||
__Step 1:__ Update the size of the EC2 cluster using the EC2 console
|
||||
|
||||
To control the instances in your cluster, go to the EC2 console and scroll down on the left side of the console and click on the link that says "Auto Scaling Groups".
|
||||
|
||||
In the next page, you will see an autoscaling group called something like `job-queue-comp23-EcsInstance`.
|
||||
|
||||
1. Select the queue by clicking on it, then click the "Edit" button in the "Group details" section.
|
||||
1. Set the desired, and maximum task capacity to n (where n includes 1 leader and n-1 workers, so there is a minimum useful size of 2), and click "Update".
|
||||
|
||||
__Step 2:__ Update the size of the ECS task pool for the leader and workers
|
||||
|
||||
Navigate to the ECS console, then select the SatCompCluster link.
|
||||
|
||||
The next page should show a list of job queues, including:
|
||||
|
||||
```text
|
||||
job-queue-comp23-SolverLeaderService-...
|
||||
job-queue-comp23-SolverworkerService-...
|
||||
```
|
||||
|
||||
Click on the SolverLeaderService job queue and choose "Update". Set the number of tasks to 1, then choose "Skip to review", then click "Update Service". You should then navigate back to the SatCompCluster link and click on the SolverWorkerService link. Choose "Update" and set the number of tasks to n-1, where 'n' is the number of EC2 nodes you created in Setup Step 1.
|
||||
|
||||
Please allow 2-3 minutes for the instances to boot up and register with the cluster. When registered, you should see 1 desired and 1 running task for the leader and n-1 desired tasks for the worker in the SatCompCluster screen.
|
||||
|
||||
If you get the following error, it means that your instances have not yet booted up:
|
||||
|
||||
```text
|
||||
An error occurred (InvalidParameterException) when calling the RunTask operation: No Container Instances were found in your cluster.
|
||||
```
|
||||
|
||||
If they fail to boot up after 5 minutes, please verify that both Desired Capacity and Maximum Capacity are set correctly.
|
||||
|
||||
You incur costs for the time the cluster is running.
|
||||
|
||||
#### Suppose I want to use the console to tear down my ECS clusters. How do I do that?
|
||||
|
||||
To control the instances in your cluster, go to the EC2 console and scroll down on the left side of the console and click on the link that says "Auto Scaling Groups".
|
||||
|
||||
In the next page, you will see an autoscaling group called something like job-queue-comp23-EcsInstance.
|
||||
|
||||
1. Select the queue by clicking on it, then click the "Edit" button in the "Group details" section.
|
||||
1. Set the desired and maximum task capacity to 0. This shuts down any EC2 instances.
|
||||
|
||||
|
||||
#### What are the various AWS services that are used in the infrastructure?
|
||||
|
||||
The infrastructure pieces are as follows:
|
||||
|
||||
* Compute resources necessary to host solvers, provided by the [Elastic Compute Cloud](https://aws.amazon.com/ec2/) (EC2) service.
|
||||
* A registry for solver Docker image files, provided by the [Elastic Container Registry](https://aws.amazon.com/ecr/) (ECR) service.
|
||||
* A compute environment to support automated launching of containers on compute resources, provided by the [Elastic Container Service](https://aws.amazon.com/ecs/) (ECS) service.
|
||||
* A storage location for solver queries that uses the [Simple Storage Service](https://aws.amazon.com/s3/) (S3) service.
|
||||
* A distributed file system that can be used for communication between leader and worker solvers, using the [Elastic File System](https://aws.amazon.com/efs/) (EFS) service.
|
||||
* A problem queue that the solver leader uses to extract the next problem to solve, provided by the [Simple Queue Service](https://aws.amazon.com/sqs/) (SQS).
|
||||
|
||||
You can click the links above for more specifics on each of the services.
|
||||
|
||||
#### Suppose I want to use the console to send SQS messages to start executing jobs. How do I do that?
|
||||
|
||||
Submit a job using the [Simple Queue Service (SQS) console](https://console.aws.amazon.com/sqs/).
|
||||
|
||||
1. Navigate to the SQS console, and select the queue named `ACCOUNT_NUMBER-us-east-1-SatCompQueue`.
|
||||
2. Click the *SendAndReceiveMessages* button
|
||||
3. Set the message body to something matching the following structure:
|
||||
|
||||
```text
|
||||
{"s3_uri":"s3://PATH_TO_CNF_PROBLEM",
|
||||
"num_workers": DESIRED_NUMBER_OF_WORKERS}
|
||||
```
|
||||
|
||||
For example, for the bucket we described earlier, given a cluster with two nodes (one worker), it would be:
|
||||
|
||||
```text
|
||||
{"s3_uri":"s3://ACCOUNT_NUMBER-us-east-1-satcompbucket/test.cnf",
|
||||
"num_workers": 1}
|
||||
```
|
||||
|
||||
#### What if I want to create different buckets other than the one provided for storing problems in?
|
||||
|
||||
In case you wish to create a different bucket, here is a command to create a bucket named `comp23-satcomp-examples`:
|
||||
|
||||
```text
|
||||
aws s3api create-bucket --bucket comp23-satcomp-examples
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- This will create the bucket in the AWS region specified in your named profile
|
||||
|
||||
- If you chose a different region than `us-east-1`, you might get an `IllegalLocationConstraintException`. To deal with the problem, you have to append the argument `--create-bucket-configuration {"LocationConstraint": "YOUR-REGION-NAME"}` to the command, where `YOUR-REGION-NAME` is replaced with the name of the region you chose.
|
||||
|
||||
- S3 buckets are globally unique across all of AWS, which means that the command will fail if someone else is already using the same bucket name. A simple fix is to add the AWS account number to the bucket name, which will likely yield a unique bucket name.
|
||||
|
||||
#### I'd like to know more about how the Elastic Container Registry works. Can I do the steps to upload files by hand?
|
||||
|
||||
Amazon stores your solver images in the [Elastic Container Registry (ECR)](https://console.aws.amazon.com/ecr).
|
||||
|
||||
The `create-solver-infrastructure` command described earlier creates an ECR repository named `comp23`.
|
||||
|
||||
|
||||
This repository stores the images for the leader and worker.
|
||||
|
||||
The repository has an associated URI (shown on the console page), which is what we use to push docker images to ECR.
|
||||
The format for URIs is
|
||||
|
||||
```text
|
||||
ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23
|
||||
```
|
||||
|
||||
You will use these URIs to describe where to store your Docker images in AWS.
|
||||
|
||||
Note: a good way to make sure that you type these URI names correctly is to navigate to the Elastic Container Registry (ECR) console in AWS and copy them from the page (the names are in the URI column).
|
||||
|
||||
|
||||
To store a Docker image in ECR from your local machine, first you need to login to ECR:
|
||||
|
||||
```text
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin AWS_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com
|
||||
```
|
||||
|
||||
Next, you need to tag the image to match the ECR repository. For the worker, tag the image as:
|
||||
|
||||
```text
|
||||
docker tag [LOCAL_WORKER_IMAGE_ID] [AWS_ACCOUNT_NUMBER].dkr.ecr.us-east-1.amazonaws.com/comp23:worker
|
||||
```
|
||||
|
||||
where
|
||||
|
||||
* **LOCAL\_WORKER\_IMAGE\_ID** is the local worker image tag (e.g., `comp23:worker`).
|
||||
* **AWS\_ACCOUNT\_ID** is the account ID where you want to store the image.
|
||||
|
||||
For the leader, tag the image as:
|
||||
|
||||
```text
|
||||
docker tag LOCAL_LEADER_IMAGE_ID ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:leader
|
||||
```
|
||||
|
||||
where
|
||||
|
||||
* **LOCAL\_LEADER\_IMAGE\_ID** is the local image for the leader, and the other parameters are the same as for the worker.
|
||||
|
||||
After these steps, you can docker push the images:
|
||||
|
||||
```text
|
||||
docker push ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:leader
|
||||
docker push ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:worker
|
||||
```
|
||||
|
||||
You should see network progress bars for both images.
|
||||
|
||||
More information related to the ECR procedures is described here: [https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html](https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html).
|
||||
|
||||
If you have trouble, please email us at: [sat-comp@amazon.com](mailto:sat-comp@amazon.com) and we will walk you through the process.
|
@ -0,0 +1,2 @@
#!/bin/sh
python3 manage-solver-infrastructure.py --mode create $*
@ -0,0 +1,3 @@
#!/bin/sh
# --solver-type is not meaningful for deletion, but the flag is required, so we pass a placeholder value.
python3 manage-solver-infrastructure.py --mode delete --solver-type parallel $*
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError, ProfileNotFound
|
||||
import sys
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("Runner")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class STS:
|
||||
def __init__(self, sts_client) -> None:
|
||||
self.sts = sts_client
|
||||
|
||||
def get_account_number(self) -> str:
|
||||
try:
|
||||
return self.sts.get_caller_identity()["Account"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get profile account number: {e}")
|
||||
raise e
|
||||
|
||||
def arg_parser() -> dict:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--profile', required = False, help = "AWS profile")
|
||||
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
|
||||
parser.add_argument('--leader', required = False, help = "Docker tag name of the leader container")
|
||||
parser.add_argument('--worker', required = False, help = "Docker tag name of the worker container (only for cloud track)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# argument checking.
|
||||
# matcher = re.compile(allowable_project_names_mask)
|
||||
if not re.match(r"^[a-z]([a-z\d\/_-])*$", args.project):
|
||||
logger.error(f"Invalid project name '{args.project}'.")
|
||||
logger.error(f"Project names must start with a lowercase letter and can consist only of lowercase numbers, digits, '/', '_', or '-' characters.")
|
||||
sys.exit(-1)
|
||||
|
||||
if not (args.leader or args.worker):
|
||||
logger.error(f"Neither leader or worker image specified...nothing to upload!")
|
||||
sys.exit(-1)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = arg_parser()
|
||||
|
||||
try:
|
||||
session = boto3.Session(profile_name=args.profile)
|
||||
except ProfileNotFound as e:
|
||||
logger.error(f"Unable to create AWS session. Profile '{profile}' not found. Please double check that this profile is set up in the ~/.aws/config file")
|
||||
sys.exit(1)
|
||||
if not session.region_name:
|
||||
logger.error(f"Profile does not have a region defined. Please add a region (recommend: us-east-1) to profile '{profile}' in the ~/.aws/config file")
|
||||
sys.exit(1)
|
||||
|
||||
# link to AWS services
|
||||
sts_client = session.client('sts')
|
||||
sts = STS(sts_client)
|
||||
|
||||
# in order to make the call, I need:
|
||||
# - the account #
|
||||
# - the region
|
||||
# - the profile
|
||||
# - the project id
|
||||
# - the leader container
|
||||
# - the worker container
|
||||
|
||||
account_number = sts.get_account_number()
|
||||
region = session.region_name
|
||||
project_name = args.project
|
||||
profile_args = ['-p', args.profile] if args.profile else []
|
||||
leader_args = ['-l', args.leader] if args.leader else []
|
||||
worker_args = ['-w', args.worker] if args.worker else []
|
||||
|
||||
|
||||
# run the shell script file to set it up.
|
||||
cmd = ' '.join(['./ecr-push-internal.sh', \
|
||||
'-j', project_name, \
|
||||
'-a', str(account_number), \
|
||||
'-r', region] + \
|
||||
profile_args + leader_args + worker_args)
|
||||
|
||||
logger.info(f"About to run: { cmd }")
|
||||
result = subprocess.run(cmd, shell=True)
|
||||
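# Exit codes reported by ecr-push-internal.sh (see the `failure` calls in that script):
# 1 = ECR login failed, 2 = leader tag failed, 3 = leader push failed,
# 4 = worker tag failed, 5 = worker push failed, 0 = success.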
if result.returncode == 1:
|
||||
logger.error("The login command to ECR failed. Please double check that you have the correct rights in the profile.")
|
||||
exit(1)
|
||||
elif result.returncode == 2:
|
||||
logger.error(f"The attempt to tag the leader image failed. Please make sure that you have a local leader image associated with {args.leader}.")
|
||||
exit(2)
|
||||
elif result.returncode == 3:
|
||||
logger.error(f"The attempt to upload the leader failed. Please make sure that there is a repository associated with {project_name}.")
|
||||
exit(3)
|
||||
elif result.returncode == 4:
|
||||
logger.error(f"The attempt to tag the worker image failed. Please make sure that you have a local worker image associated with {args.worker}.")
|
||||
exit(4)
|
||||
elif result.returncode == 5:
|
||||
logger.error(f"The attempt to upload the worker failed. Please make sure that there is a repository associated with {project_name}.")
|
||||
exit(5)
|
||||
elif result.returncode == 0:
|
||||
logger.info("Upload succeeded. ")
|
||||
exit(0)
|
||||
else:
|
||||
logger.error(f"Unexpected return code: {result.returncode} from ecr-push-internal.sh; exiting")
|
||||
exit(6)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
2
aws-batch-comp-infrastructure-sample/infrastructure/ecr-push
Executable file
@ -0,0 +1,2 @@
|
||||
#!/bin/sh
|
||||
python3 docker-upload-to-ecr.py $*
|
115
aws-batch-comp-infrastructure-sample/infrastructure/ecr-push-internal.sh
Executable file
@ -0,0 +1,115 @@
|
||||
#!/bin/bash
|
||||
|
||||
print_args() {
|
||||
echo "usage: ecr-push-internal.sh -p PROFILE -j PROJECT -a ACCOUNT -r REGION -l LEADER [-h] [-w WORKER]"
|
||||
echo " "
|
||||
echo "required arguments:"
|
||||
echo " -j, --project PROJECT Name of the project"
|
||||
echo " -a. --account ACCOUNT Account number"
|
||||
echo " -r, --region REGION Region"
|
||||
echo "optional arguments:"
|
||||
echo " -p, --profile PROFILE AWS profile"
|
||||
echo " -h, --help show this message and exit"
|
||||
echo " -l, --leader LEADER Name of the leader local docker image"
|
||||
echo " -w, --worker WORKER Name of the worker local docker image"
|
||||
}
|
||||
|
||||
failure() {
|
||||
echo "Failure: $2"
|
||||
exit $1
|
||||
}
|
||||
|
||||
POSITIONAL_ARGS=()
|
||||
|
||||
if [ $# -lt 4 ];
|
||||
then
|
||||
echo "args: $*"
|
||||
echo "arg count: $#"
|
||||
print_args
|
||||
exit -1;
|
||||
fi
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-l|--leader)
|
||||
LEADER="$2"
|
||||
shift # past argument
|
||||
shift # past value
|
||||
;;
|
||||
-w|--worker)
|
||||
WORKER="$2"
|
||||
shift # past argument
|
||||
shift # past value
|
||||
;;
|
||||
-p|--profile)
|
||||
PROFILE="$2"
|
||||
shift # past argument
|
||||
shift # past value
|
||||
;;
|
||||
-j|--project)
|
||||
PROJECT="$2"
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-a|--account)
|
||||
ACCOUNT="$2"
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-r|--region)
|
||||
REGION="$2"
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
echo "ABOUT TO RUN HELP"
|
||||
print_args
|
||||
shift # past argument
|
||||
;;
|
||||
-*|--*)
|
||||
echo "Unknown option $1"
|
||||
exit 1
|
||||
;;
|
||||
*)
|
||||
echo "Unknown positional argument $1"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# echo "Leader is: " $LEADER
|
||||
# echo "Worker is: " $WORKER
|
||||
# echo "Project is: " $PROJECT
|
||||
# echo "Profile is: " $PROFILE
|
||||
|
||||
if [ -z ${PROJECT+x} ] || [ -z ${ACCOUNT+x} ] || [ -z ${REGION+x} ] ;
|
||||
then
|
||||
echo "account, region, project and profile are all required arguments"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
if [ -z ${PROFILE+x} ] ;
|
||||
then
|
||||
echo "Profile not set.";
|
||||
else
|
||||
PROFILE_ARG="--profile ${PROFILE}"
|
||||
echo "Profile arg is: $PROFILE_ARG";
|
||||
fi
|
||||
|
||||
aws $PROFILE_ARG --region $REGION ecr get-login-password | docker login --username AWS --password-stdin "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com || failure 1 "Line ${LINENO}"
|
||||
|
||||
if [ -z ${LEADER+x} ];
|
||||
then
|
||||
echo "No leader set";
|
||||
else
|
||||
docker tag "$LEADER" "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":leader || failure 2 "Line ${LINENO}"
|
||||
docker push "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":leader || failure 3 "Line ${LINENO}"
|
||||
fi
|
||||
|
||||
if [ -z ${WORKER+x} ];
|
||||
then
|
||||
echo "No worker set";
|
||||
else
|
||||
docker tag "$WORKER" "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":worker || failure 4 "Line ${LINENO}"
|
||||
docker push "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":worker || failure 5 "Line ${LINENO}"
|
||||
fi
|
166
aws-batch-comp-infrastructure-sample/infrastructure/ecs-config
Executable file
@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import boto3
|
||||
import time
|
||||
import pprint
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("Runner")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class EcsService:
|
||||
def __init__(self, client):
|
||||
self.ecs = client
|
||||
|
||||
def get_ecs_service(self, service_name):
|
||||
"""Get ECS services and return Worker service name
|
||||
|
||||
Returns: ECS worker node servicename
|
||||
|
||||
"""
|
||||
try:
|
||||
response = self.ecs.list_services(
|
||||
cluster='SatCompCluster',
|
||||
)
|
||||
|
||||
for service in response['serviceArns']:
|
||||
if service_name in service:
|
||||
return service
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get ECS service names: {e}")
|
||||
raise e
|
||||
|
||||
def update_ecs_service(self, leader_node_count, worker_node_count):
|
||||
worker_service = self.get_ecs_service("SolverWorkerService")
|
||||
leader_service = self.get_ecs_service("SolverLeaderService")
|
||||
try:
|
||||
self.ecs.update_service(
|
||||
cluster="SatCompCluster",
|
||||
service=leader_service,
|
||||
desiredCount=leader_node_count
|
||||
)
|
||||
self.ecs.update_service(
|
||||
cluster="SatCompCluster",
|
||||
service=worker_service,
|
||||
desiredCount=worker_node_count
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to update ECS service: {e}")
|
||||
raise e
|
||||
|
||||
def describe_ecs_services(self):
|
||||
result = {}
|
||||
worker_service = self.get_ecs_service("SolverWorkerService")
|
||||
leader_service = self.get_ecs_service("SolverLeaderService")
|
||||
try:
|
||||
result = self.ecs.describe_services(
|
||||
cluster="SatCompCluster",
|
||||
services=[leader_service, worker_service]
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to describe ECS service: {e}")
|
||||
raise e
|
||||
return result
|
||||
|
||||
class ScalingGroup:
|
||||
def __init__(self, client) -> None:
|
||||
self.asg_client = client
|
||||
|
||||
def update_asg(self, desired_count: str):
|
||||
try:
|
||||
response = self.asg_client.describe_auto_scaling_groups()['AutoScalingGroups']
|
||||
for group in response:
|
||||
if 'EcsInstanceAsg' in group["AutoScalingGroupName"]:
|
||||
asg_name = group["AutoScalingGroupName"]
|
||||
|
||||
response = self.asg_client.update_auto_scaling_group(
|
||||
AutoScalingGroupName= asg_name,
|
||||
MaxSize=desired_count,
|
||||
DesiredCapacity=desired_count,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to update ASG: {e}")
|
||||
raise e
|
||||
|
||||
|
||||
def await_completion(ecs_service, asg_client):
|
||||
# wait for ECS setup/teardown to complete
|
||||
start = time.time()
|
||||
while True:
|
||||
status = ecs_service.describe_ecs_services()
|
||||
|
||||
leader = status["services"][0]["deployments"][0]
|
||||
leader_desired = leader["desiredCount"]
|
||||
leader_pending = leader["pendingCount"]
|
||||
leader_running = leader["runningCount"]
|
||||
|
||||
worker = status["services"][1]["deployments"][0]
|
||||
worker_desired = worker["desiredCount"]
|
||||
worker_pending = worker["pendingCount"]
|
||||
worker_running = worker["runningCount"]
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"Waiting for ECS ({elapsed/60.:3.1f} mins)")
|
||||
print(f" leader: {leader_desired} desired, {leader_pending} pending, {leader_running} running")
|
||||
print(f" workers: {worker_desired} desired, {worker_pending} pending, {worker_running} running")
|
||||
print("")
|
||||
|
||||
if (leader_desired==leader_running and worker_desired==worker_running):
|
||||
print("ECS configuration complete")
|
||||
return
|
||||
|
||||
time.sleep(30)
|
||||
|
||||
# MWW: I am disabling this output since the output 'lies' in the sense that it reports
|
||||
# a failure in the usual case, before eventually succeeding.
|
||||
|
||||
# put this after the first sleep, since it's usually delayed
|
||||
# asg_status = asg_client.describe_scaling_activities()
|
||||
# only display the most recent message
|
||||
# asg_update = asg_status["Activities"][0]
|
||||
# print(f"Most recent AutoScaling Activity Log")
|
||||
# print(f" StatusCode: {asg_update['StatusCode']}")
|
||||
# print(f" StatusCause: {asg_update['Cause']}")
|
||||
# print(f" StatusDescription: {asg_update['Description']}")
|
||||
# print("")
|
||||
#pprint.pprint(asg_update)
|
||||
#print("")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('mode', choices = ["setup", "shutdown"], help = "setup: add leader/worker instances; shutdown: remove all instances.")
|
||||
parser.add_argument('--profile', default = "default", required = False, help = "AWS profile")
|
||||
parser.add_argument('--workers', required = False, help = "Number of worker nodes to start (required for 'setup' mode)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.mode == 'setup':
|
||||
# Setup Instances
|
||||
worker_node_count = args.workers
|
||||
leader_node_count = "1"
|
||||
desired_count = str(int(worker_node_count)+1)
|
||||
elif args.mode == 'shutdown':
|
||||
# Shutdown instances
|
||||
leader_node_count = worker_node_count = desired_count = "0"
|
||||
|
||||
session = boto3.Session(profile_name=args.profile)
|
||||
ecs = session.client('ecs')
|
||||
ecs_service = EcsService(ecs)
|
||||
|
||||
asg_client = session.client('autoscaling')
|
||||
asg = ScalingGroup(asg_client)
|
||||
|
||||
asg.update_asg(int(desired_count))
|
||||
try:
|
||||
ecs_service.update_ecs_service(int(leader_node_count), int(worker_node_count))
|
||||
except Exception as e:
|
||||
logger.info(f"Failed to update ECS service. {e}")
|
||||
logger.info("Updating ASG")
|
||||
asg.update_asg("0")
|
||||
|
||||
# wait for ECS setup/teardown to complete
|
||||
await_completion(ecs_service, asg_client)
|
@ -0,0 +1,333 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError, ProfileNotFound
|
||||
import time
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("Runner")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class STS:
|
||||
def __init__(self, sts_client) -> None:
|
||||
self.sts = sts_client
|
||||
|
||||
def get_account_number(self) -> str:
|
||||
try:
|
||||
return self.sts.get_caller_identity()["Account"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get profile account number: {e}")
|
||||
raise e
|
||||
|
||||
class ECR:
|
||||
def __init__(self, ecr_client) -> None:
|
||||
self.ecr = ecr_client
|
||||
|
||||
def delete_repository_images(self, repository_name) -> None:
|
||||
try:
|
||||
images = self.ecr.list_images(
|
||||
repositoryName=repository_name
|
||||
)
|
||||
if images["imageIds"]:
|
||||
self.ecr.batch_delete_image(
|
||||
repositoryName=repository_name,
|
||||
imageIds=images["imageIds"]
|
||||
)
|
||||
except self.ecr.exceptions.RepositoryNotFoundException as e:
|
||||
logger.info(f"Repository {repository_name} not found (already deleted). Error: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete repository images: {e}")
|
||||
raise e
|
||||
|
||||
def delete_project_repositories(self, project_name) -> None:
|
||||
# These must be kept synchronized with Cfn.
|
||||
repository_name = project_name
|
||||
self.delete_repository_images(repository_name)
|
||||
|
||||
|
||||
class S3Filesystem:
|
||||
def __init__(self, s3_client) -> None:
|
||||
self.s3 = s3_client
|
||||
|
||||
def delete_bucket(self, bucket_name) -> None:
|
||||
try:
|
||||
bucket = self.s3.Bucket(bucket_name)
|
||||
bucket.objects.all().delete()
|
||||
bucket.object_versions.delete()
|
||||
bucket.Policy().delete()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete all objects in bucket: {e}")
|
||||
raise e
|
||||
|
||||
def upload_file_to_s3(self, bucket_name: str, file_name: str, object_name=None) -> None:
|
||||
"""Upload a file to an S3 bucket
|
||||
|
||||
:param file_name: File to upload
|
||||
:param bucket_name: Bucket to upload to
|
||||
:param object_name: S3 object name. If not specified then file_name is used
|
||||
:return: True if file was uploaded, else False
|
||||
"""
|
||||
|
||||
# If S3 object_name was not specified, use file_name
|
||||
if object_name is None:
|
||||
object_name = file_name
|
||||
|
||||
# Upload the file
|
||||
try:
|
||||
bucket = self.s3.Bucket(bucket_name)
|
||||
bucket.upload_file(file_name, object_name)
|
||||
except ClientError as e:
|
||||
logging.error(f"Failed to upload file to s3: {e}")
|
||||
raise e
|
||||
|
||||
class SSM:
|
||||
def __init__(self, ssm_client) -> None:
|
||||
self.ssm = ssm_client
|
||||
|
||||
# Get the recommended AMI.
|
||||
# The instance AMI describes the
|
||||
# [Amazon Machine Image](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html),
|
||||
# the basic software image to be loaded onto each EC2 instance, that describes the
|
||||
# operating system and other configuration information.
|
||||
def get_ami_image(self) -> str:
|
||||
try:
|
||||
result = self.ssm.get_parameters(
|
||||
Names=["/aws/service/ecs/optimized-ami/amazon-linux-2/recommended"])
|
||||
logger.debug(f"Result of get_parameters is {result}")
|
||||
result_value = json.loads(result["Parameters"][0]["Value"])
|
||||
image_id = result_value["image_id"]
|
||||
logger.info(f"Recommended image id: {image_id}")
|
||||
return result_value["image_id"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to call ssm.get_parameters to find the recommended AMI for linux: {e}")
|
||||
raise e
|
||||
|
||||
|
||||
class CloudFormation:
|
||||
def __init__(self, cfn_client, stack_name: str) -> None:
|
||||
self.stack_name = stack_name
|
||||
self.cfn = cfn_client
|
||||
|
||||
def create_parameters(self, project_name, instance_type, ami_id, container_memory):
|
||||
return [
|
||||
{
|
||||
'ParameterKey': "ProjectName",
|
||||
'ParameterValue': project_name
|
||||
},
|
||||
{
|
||||
'ParameterKey': "AvailZoneId",
|
||||
'ParameterValue': "a"
|
||||
},
|
||||
{
|
||||
'ParameterKey': "InstanceType",
|
||||
'ParameterValue': instance_type
|
||||
},
|
||||
{
|
||||
'ParameterKey': "AmiImageId",
|
||||
'ParameterValue': ami_id
|
||||
},
|
||||
{
|
||||
'ParameterKey': "ContainerMemory",
|
||||
'ParameterValue': str(container_memory)
|
||||
},
|
||||
]
|
||||
|
||||
def get_stack(self):
|
||||
return self.cfn.Stack(self.stack_name)
|
||||
|
||||
def create_cloudformation_stack(self, project_name, instance_type, ami_id, container_memory) -> None:
|
||||
try:
|
||||
cf_template = open('solver-infrastructure.yaml').read()
|
||||
response = self.cfn.create_stack(
|
||||
StackName=self.stack_name,
|
||||
TemplateBody=cf_template,
|
||||
Capabilities=["CAPABILITY_IAM"],
|
||||
Parameters=self.create_parameters(project_name, instance_type, ami_id, container_memory)
|
||||
)
|
||||
except self.cfn.exceptions.AlreadyExistsException as exists:
|
||||
logger.error("Stack already exists: perhaps it failed to properly create? Try deleting it with delete-solver-infrastructure.")
|
||||
raise exists
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create stack: {e}")
|
||||
raise e
|
||||
|
||||
def delete_cloudformation_stack(self) -> None:
|
||||
try:
|
||||
stack = self.cfn.Stack(self.stack_name)
|
||||
stack.delete()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete stack: {e}")
|
||||
raise e
|
||||
|
||||
def update_cloudformation_stack(self, project_name, instance_type, ami_id, container_memory) -> None:
|
||||
try:
|
||||
cf_template = open('solver-infrastructure.yaml').read()
|
||||
stack = self.cfn.Stack(self.stack_name)
|
||||
response = stack.update(
|
||||
StackName=self.stack_name,
|
||||
TemplateBody=cf_template,
|
||||
Capabilities=["CAPABILITY_IAM"],
|
||||
Parameters=self.create_parameters(project_name, instance_type, ami_id, container_memory)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create stack: {e}")
|
||||
raise e
|
||||
|
||||
def await_completion(self, mode) -> bool:
|
||||
while True:
|
||||
stack = self.get_stack()
|
||||
try:
|
||||
status = stack.stack_status
|
||||
except Exception as e:
|
||||
logger.info(f"Stack {mode} operation in progress. Cloudformation states stack is deleted.")
|
||||
return mode == 'delete'
|
||||
|
||||
logger.info(f"Stack {mode} operation in progress. Current status: {stack.stack_status}")
|
||||
|
||||
if "IN_PROGRESS" in stack.stack_status:
|
||||
logger.info("Pausing 30 seconds")
|
||||
# Sleep time added to reduce cloudformation api calls to get status
|
||||
time.sleep(30)
|
||||
elif ("ROLLBACK" in stack.stack_status or
|
||||
"FAILED" in stack.stack_status):
|
||||
logger.info(f"Cloudformation operation {mode} failed!")
|
||||
return False
|
||||
elif "COMPLETE" in stack.stack_status:
|
||||
logger.info(f"Cloudformation operation {mode} succeeded!")
|
||||
return True
|
||||
else:
|
||||
logger.error(f"Unexpected cloudformation state {stack.stack_status}")
|
||||
raise Exception(f"Unexpected cloudformation state when awaiting completion: {stack.stack_status}")
|
||||
|
||||
def get_satcomp_bucket(account_number, region, project_name) -> str:
|
||||
# needs to be synched with CloudFormation
|
||||
return str(account_number) + '-' + region + '-' + project_name
|
||||
|
||||
def delete_resources(session, s3, project_name, bucket) -> None:
|
||||
# delete s3 bucket
|
||||
|
||||
logger.info(f"Deleting S3 bucket {bucket}")
|
||||
s3_client = session.client('s3')
|
||||
|
||||
try:
|
||||
s3.delete_bucket(bucket)
|
||||
except s3_client.exceptions.NoSuchBucket as e:
|
||||
logger.info(f"No bucket {bucket} exists; perhaps it was already deleted.")
|
||||
|
||||
# delete ecr instances
|
||||
ecr_client = session.client('ecr')
|
||||
ecr = ECR(ecr_client)
|
||||
logger.info(f"Deleting all images within ECR repository")
|
||||
ecr.delete_project_repositories(project_name)
|
||||
|
||||
def arg_parser() -> dict:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--profile', required = False, help = "AWS profile (uses 'default' if not provided)")
|
||||
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
|
||||
parser.add_argument('--solver-type', required = True, choices = ('cloud', 'parallel'), help = "Either 'cloud' or 'parallel' depending on the desired configuration.")
|
||||
parser.add_argument('--mode', required = False, default = 'create', choices = ('create', 'update', 'delete'), help = "One of 'create', 'update', 'delete': create, update or delete the infrastructure.")
|
||||
# MWW: can add these back in for advanced users, but hiding for now.
|
||||
# parser.add_argument('--dryrun', type=bool, required = False, help = "Dry run ONLY. Do not actually create resources.")
|
||||
# parser.add_argument('--instance', required = False, help = "EXPERTS ONLY: Override default machine instance type")
|
||||
# parser.add_argument('--ami', required = False, help = "EXPERTS ONLY: Override default instance AMI")
|
||||
# parser.add_argument('--memory', type=int, required = True, help = "EXPERTS ONLY: Override default max memory for container (61000 for cloud, 253000 for parallel)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# argument checking.
|
||||
# matcher = re.compile(allowable_project_names_mask)
|
||||
if not re.match(r"^[a-z]([a-z\d\/_-])*$", args.project):
|
||||
logger.error(f"Invalid project name '{args.project}'.")
|
||||
logger.error(f"Project names must start with a lowercase letter and can consist only of lowercase numbers, digits, '/', '_', or '-' characters.")
|
||||
sys.exit(-1)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = arg_parser()
|
||||
|
||||
if args.solver_type == "cloud":
|
||||
instance_type = 'm6i.4xlarge'
|
||||
memory = '61000'
|
||||
elif args.solver_type == "parallel":
|
||||
instance_type = 'm6i.16xlarge'
|
||||
memory = '253000'
|
||||
else:
|
||||
raise Exception('Solver type argument must be one of "cloud" or "parallel"')
|
||||
|
||||
profile = args.profile
|
||||
#profile = "default"
|
||||
project_name = args.project
|
||||
|
||||
stack_name = f"solver-infrastructure-{project_name}"
|
||||
|
||||
#RBJ#
|
||||
try:
|
||||
if profile:
|
||||
session = boto3.Session(profile_name=profile)
|
||||
else:
|
||||
session = boto3.Session()
|
||||
except ProfileNotFound as e:
|
||||
logger.error(f"Unable to create AWS session. Please check that default profile is set up in the ~/.aws/config file and has appropriate rights (or if --profile was provided, that this profile has appropriate rights)")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if not session.region_name:
|
||||
logger.error(f"Profile does not have a region defined. Please add a region (recommend: us-east-1) to profile '{profile}' in the ~/.aws/config file")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
# link to AWS services
|
||||
ssm_client = session.client('ssm')
|
||||
ssm = SSM(ssm_client)
|
||||
|
||||
sts_client = session.client('sts')
|
||||
sts = STS(sts_client)
|
||||
|
||||
s3 = session.resource('s3')
|
||||
s3_file_system = S3Filesystem(s3)
|
||||
|
||||
cloudformation = session.resource('cloudformation')
|
||||
cfn = CloudFormation(cloudformation, stack_name)
|
||||
|
||||
# set up parameters
|
||||
account_number = sts.get_account_number()
|
||||
region = session.region_name
|
||||
ami_id = ssm.get_ami_image()
|
||||
satcomp_bucket = get_satcomp_bucket(account_number, region, project_name)
|
||||
|
||||
# logger.info("Exiting early so as not to actually do anything")
|
||||
# return
|
||||
|
||||
if args.mode == "update":
|
||||
cfn.update_cloudformation_stack(project_name, instance_type, ami_id, memory)
|
||||
cfn.await_completion("update")
|
||||
elif args.mode == "delete":
|
||||
delete_resources(session, s3_file_system, project_name, satcomp_bucket)
|
||||
cfn.delete_cloudformation_stack()
|
||||
cfn.await_completion("delete")
|
||||
elif args.mode == "create":
|
||||
cfn.create_cloudformation_stack(project_name, instance_type, ami_id, memory)
|
||||
ok = cfn.await_completion("create")
|
||||
if ok:
|
||||
logger.info(f"Uploading test.cnf file to the {satcomp_bucket} bucket")
|
||||
s3_file_system.upload_file_to_s3(satcomp_bucket, "test.cnf")
|
||||
|
||||
else:
|
||||
logger.error(f"Unexpected operation {args.mode}")
|
||||
raise Exception("Internal error: unexpected operation {args.mode}")
|
||||
except ClientError as ce:
|
||||
logger.error("An error occurred during an AWS operation. Usually this is caused by the AWS session having insufficient rights to resources. Please double check that the default profile is properly set up in your AWS Config.")
|
||||
logger.error("Error description: {ce}")
|
||||
raise ce
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
81
aws-batch-comp-infrastructure-sample/infrastructure/quickstart-build
Executable file
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import boto3
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("Quickstart Build")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--profile', required = False, help = "AWS profile")
|
||||
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
|
||||
args = parser.parse_args()
|
||||
|
||||
profile_args = ['--profile', args.profile] if args.profile else []
|
||||
project_args = ['--project', args.project]
|
||||
|
||||
# run the create script
|
||||
cmd = ['python3', 'manage-solver-infrastructure.py', \
|
||||
'--solver-type', 'cloud', \
|
||||
'--mode', 'create'] \
|
||||
+ profile_args + project_args
|
||||
logger.info(f"Creating the AWS infrastructure for project {args.project} using create-solver-project.")
|
||||
logger.info("This operation will likely take 5-7 minutes.")
|
||||
try:
|
||||
result = subprocess.run(cmd)
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: Unable to run {cmd}. Is python3 installed and in your path?")
|
||||
exit(-1)
|
||||
if result.returncode:
|
||||
logger.error(f"create-solver-infrastructure failed with error code {result}")
|
||||
logger.error("Unable to create solver infrastructure...has it already been created?")
|
||||
logger.error("If not, have you set up the default profile in your ~/.aws/config file?")
|
||||
logger.error("If so, does your project name meet the expected format?")
|
||||
exit(-1)
|
||||
|
||||
logger.info("Building the Docker images for the competition base containers")
|
||||
cmd = ['cd ../docker/satcomp-images && ./build_satcomp_images.sh && cd ../../infrastructure']
|
||||
try:
|
||||
result = subprocess.run(cmd, shell=True)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: unable to run command to build base image: {cmd}.")
|
||||
exit(-1)
|
||||
if result.returncode:
|
||||
logger.error("Unexpected error: Unable to build docker images for base containers.")
|
||||
exit(-1)
|
||||
|
||||
logger.info("Building the Docker images for Mallob. This will require ~10 minutes.")
|
||||
cmd = ['cd ../docker/mallob-images && ./build_mallob_images.sh && cd ../../infrastructure']
|
||||
try:
|
||||
result = subprocess.run(cmd, shell=True)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: unable to run command to build mallob image: {cmd}.")
|
||||
exit(-1)
|
||||
if result.returncode:
|
||||
logger.error("Unexpected error: Unable to build docker images for base containers.")
|
||||
exit(-1)
|
||||
|
||||
cmd = ['python3', 'docker-upload-to-ecr.py', \
|
||||
'--leader', 'satcomp-mallob:leader', \
|
||||
'--worker', 'satcomp-mallob:worker'] \
|
||||
+ profile_args + project_args
|
||||
logger.info("Uploading the Mallob Docker files to ECR.")
|
||||
logger.info("This operation will likely take 2-5 minutes, depending on network bandwidth.")
|
||||
try:
|
||||
result = subprocess.run(cmd)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: Unable to upload Mallob docker images to ECR.")
|
||||
exit(-1)
|
||||
if result.returncode:
|
||||
logger.error(f"Unexpected error (return code {result}). Unable to upload Mallob docker images to ECR")
|
||||
exit(-1)
|
||||
|
||||
logger.info("Upload complete; quickstart-build successful!")
|
||||
exit(0)
|
||||
|
114
aws-batch-comp-infrastructure-sample/infrastructure/quickstart-run
Executable file
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import boto3
|
||||
import json
|
||||
import subprocess
import sys
|
||||
from botocore.exceptions import ProfileNotFound
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("Quickstart Run")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
WORKER_NODES = 3
|
||||
|
||||
class STS:
|
||||
def __init__(self, sts_client) -> None:
|
||||
self.sts = sts_client
|
||||
|
||||
def get_account_number(self) -> str:
|
||||
try:
|
||||
return self.sts.get_caller_identity()["Account"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get profile account number: {e}")
|
||||
raise e
|
||||
|
||||
|
||||
def argument_parser():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--profile', required = False, help = "AWS profile")
|
||||
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
|
||||
parser.add_argument('--keep-alive', required = False, help = "If True, then cluster is not destroyed after solving.")
|
||||
parser.add_argument('--s3-locations', nargs='+', required = False, help = "S3 URLs for problem location of the form: s3://BUCKET_NAME/OBJECT_LOCATION. Default is to run test.cnf from the default bucket. You can specify multiple URLs (space separated)")
|
||||
return parser
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argument_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
if args.profile:
|
||||
session = boto3.Session(profile_name=args.profile)
|
||||
else:
|
||||
session = boto3.Session()
|
||||
except ProfileNotFound as e:
|
||||
logger.error(f"Unable to create AWS session. Please check that default profile is set up in the ~/.aws/config file and has appropriate rights (or if --profile was provided, that this profile has appropriate rights)")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
sts = STS(session.client('sts'))
|
||||
account_number = sts.get_account_number()
|
||||
|
||||
# Default competition bucket for this account/project; also used below when
# reporting where intermediate files are written.
s3_bucket = f"s3://{account_number}-us-east-1-{args.project}"
if not args.s3_locations:
    problem_locations = [f"{s3_bucket}/test.cnf"]
else:
    problem_locations = args.s3_locations
|
||||
|
||||
profile_args = ['--profile', args.profile] if args.profile else []
|
||||
|
||||
# run the ecs-config script
|
||||
cmd = ['python3', 'ecs-config',
|
||||
'--workers', str(WORKER_NODES), \
|
||||
'setup'] \
|
||||
+ profile_args
|
||||
logger.info(f"Setting up ECS cluster with {WORKER_NODES} worker nodes using ecs-config.")
|
||||
logger.info(f"command that is being executed is: {' '.join(cmd)}")
|
||||
logger.info("This operation will likely take 5-7 minutes.")
|
||||
logger.info("***Note that while configuration is in process, the system will report Autoscaling failures. This is normal, but should not persist for more than 10 minutes.***")
|
||||
try:
|
||||
result = subprocess.run(cmd)
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: Unable to run {cmd}. ")
|
||||
exit(-1)
|
||||
if result.returncode:
|
||||
logger.error(f"ecs-config failed with error code {result}")
|
||||
logger.error("Have you set up the default profile in your ~/.aws/config file?")
|
||||
exit(-1)
|
||||
|
||||
for problem_location in problem_locations:
|
||||
cmd = ['python3', 'send_message', '--location', problem_location, '--workers', str(WORKER_NODES), '--await-response', 'True'] + profile_args
|
||||
|
||||
logger.info("Sending a message to the cluster to run the `temp.cnf' problem.")
|
||||
logger.info("It is stored in the S3 location: {problem_location}")
|
||||
logger.info("Please go inspect the CloudWatch logs for project {args.project} to see it running, as described in the Infrastructure README.")
|
||||
logger.info(f"command that is being executed is: {' '.join(cmd)}")
|
||||
try:
|
||||
result = subprocess.run(cmd)
|
||||
logger.info(f"Send message completed. Intermediate files and stdout/stderr are available in {s3_bucket}/tmp")
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: unable to run command to send message.")
|
||||
if result.returncode:
|
||||
logger.error("Unexpected error: Command to send message to queue failed.")
|
||||
|
||||
if args.keep_alive:
|
||||
logger.info("Keep-alive option selected; not deleting cluster")
|
||||
else:
|
||||
logger.info("Deleting the cluster. This will require a minute or two.")
|
||||
cmd = ['python3', 'ecs-config', 'shutdown'] + profile_args
|
||||
logger.info("Tearing down the cluster.")
|
||||
logger.info(f"command that is being executed is: {' '.join(cmd)}")
|
||||
try:
|
||||
result = subprocess.run(cmd)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error {e}: unable to run command: {cmd}.")
|
||||
exit(-1)
|
||||
if result.returncode:
|
||||
logger.error("Unexpected error returned by cluster teardown. PLEASE MANUALLY CHECK WHETHER CLUSTER WAS DELETED.")
|
||||
exit(-1)
|
||||
|
||||
|
||||
logger.info("Run complete; quickstart-run successful!")
|
||||
exit(0)
|
||||
|
After Width: | Height: | Size: 137 KiB |
After Width: | Height: | Size: 83 KiB |
After Width: | Height: | Size: 214 KiB |
After Width: | Height: | Size: 67 KiB |
After Width: | Height: | Size: 62 KiB |
After Width: | Height: | Size: 98 KiB |
After Width: | Height: | Size: 34 KiB |
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 178 KiB |
172
aws-batch-comp-infrastructure-sample/infrastructure/send_message
Executable file
@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import boto3
|
||||
import json
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("Runner")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
QUEUE_WAIT_TIME = 10
|
||||
|
||||
|
||||
|
||||
class SqsService:
|
||||
def __init__(self, client):
|
||||
self.sqs = client
|
||||
|
||||
def get_satcomp_queue(self):
|
||||
"""Get SQS services and return sat comp queue.
|
||||
|
||||
Returns: SQS SatCompQueue
|
||||
|
||||
"""
|
||||
try:
|
||||
response = self.sqs.list_queues()
|
||||
for service in response['QueueUrls']:
|
||||
if service.endswith('SatCompQueue'):
|
||||
return service
|
||||
|
||||
raise "No queue ending with 'SatCompQueue' "
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get SQS queue: {e}")
|
||||
raise e
|
||||
|
||||
def get_satcomp_output_queue(self):
|
||||
"""Get SQS services and return sat comp queue.
|
||||
|
||||
Returns: SQS SatCompOutputQueue
|
||||
|
||||
"""
|
||||
try:
|
||||
response = self.sqs.list_queues()
|
||||
for service in response['QueueUrls']:
|
||||
if service.endswith('SatCompOutputQueue'):
|
||||
return service
|
||||
|
||||
raise "No queue ending with 'SatCompOutputQueue' "
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get SQS queue: {e}")
|
||||
raise e
|
||||
|
||||
|
||||
def send_message(self, location, workers, timeout, solverName, language, solverOptions):
|
||||
# Expected message structure:
|
||||
"""{
|
||||
"formula" : {
|
||||
"value" : <s3 url>,
|
||||
"language": "SMTLIB2" | "DIMACS",
|
||||
},
|
||||
"solverConfig" : {
|
||||
"solverName" : "",
|
||||
"solverOptions" : [],
|
||||
"taskTimeoutSeconds" : 5
|
||||
},
|
||||
"num_workers": 0
|
||||
}"""
|
||||
queue = self.get_satcomp_queue()
|
||||
|
||||
message_body = { \
|
||||
"formula": { \
|
||||
"value": location, \
|
||||
"language": language \
|
||||
}, \
|
||||
"solverConfig" : { \
|
||||
"solverName" : solverName, \
|
||||
"solverOptions" : solverOptions, \
|
||||
"taskTimeoutSeconds" : timeout, \
|
||||
}, \
|
||||
"num_workers": workers \
|
||||
}
|
||||
|
||||
message_body_str = json.dumps(message_body, indent = 4)
|
||||
try:
|
||||
response = self.sqs.send_message(
|
||||
QueueUrl = queue,
|
||||
MessageBody = message_body_str
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send message: Exception: {e}")
|
||||
raise e
|
||||
|
||||
#
|
||||
#
|
||||
# {
|
||||
# 'Messages': [
|
||||
# {
|
||||
# 'MessageId': 'string',
|
||||
# 'ReceiptHandle': 'string',
|
||||
# 'MD5OfBody': 'string',
|
||||
# 'Body': 'string',
|
||||
# 'Attributes': {
|
||||
# 'string': 'string'
|
||||
# },
|
||||
# 'MD5OfMessageAttributes': 'string',
|
||||
# 'MessageAttributes': {
|
||||
# 'string': {
|
||||
# 'StringValue': 'string',
|
||||
# 'BinaryValue': b'bytes',
|
||||
# 'StringListValues': [
|
||||
# 'string',
|
||||
# ],
|
||||
# 'BinaryListValues': [
|
||||
# b'bytes',
|
||||
# ],
|
||||
# 'DataType': 'string'
|
||||
# }
|
||||
# }
|
||||
# },
|
||||
# ]
|
||||
# }
|
||||
def receive_and_delete_message(self, timeout):
|
||||
queue = self.get_satcomp_output_queue()
|
||||
logger.info(f"Receiving and deleting message from queue {queue}")
|
||||
total_time = 0
|
||||
while (total_time < timeout + 5):
|
||||
logger.info(f"Waiting up to 10s for a message.")
|
||||
response = self.sqs.receive_message(
|
||||
QueueUrl = queue,
|
||||
WaitTimeSeconds = QUEUE_WAIT_TIME
|
||||
)
|
||||
if response["Messages"]:
|
||||
for msg in response["Messages"]:
|
||||
body = msg["Body"]
|
||||
logger.info(f"Response from receive_message was: {body}")
|
||||
self.sqs.delete_message(
|
||||
QueueUrl = queue,
|
||||
ReceiptHandle = msg["ReceiptHandle"]
|
||||
)
|
||||
return response
|
||||
|
||||
total_time = total_time + QUEUE_WAIT_TIME
|
||||
logger.error("Solver did not complete within expected timeout.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--profile', required = False, help = "AWS profile")
|
||||
parser.add_argument('--location', required = True, help = "S3 location for CNF file")
|
||||
parser.add_argument('--workers', required = True, type=int, help = "Required Worker nodes count")
|
||||
parser.add_argument('--timeout', type=int, help = "Timeout value for the infrastructure to interrupt the solver", default = 60)
|
||||
parser.add_argument('--name', help = "Name of solver to be invoked (passed through to the solver). Default: empty string", default = "")
|
||||
parser.add_argument('--format', help = "Problem language for the problem to be solved ('DIMACS' or 'SMTLIB2').", default = "")
|
||||
parser.add_argument('--args', nargs='+', help="Arguments to pass through to the solver (--args accepts a space-delimited list of arguments). Default: empty list", default = [])
|
||||
parser.add_argument('--await-response', required = False, help = "If true, then solver will poll output queue for response message, display, and delete it.")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
profile = args.profile
|
||||
|
||||
# Send message
|
||||
session = boto3.Session(profile_name=profile)
|
||||
sqs_client = session.client('sqs')
|
||||
sqs = SqsService(sqs_client)
|
||||
try:
|
||||
sqs.send_message(args.location, args.workers, args.timeout, args.name, args.format, args.args)
|
||||
if args.await_response:
|
||||
sqs.receive_and_delete_message(args.timeout)
|
||||
except Exception as e:
|
||||
logger.info(f"Failed to send message. {e}")
|
||||
|
@ -0,0 +1,78 @@
|
||||
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
AWSTemplateFormatVersion: 2010-09-09
|
||||
|
||||
Parameters:
|
||||
emailAddress:
|
||||
Type: String
|
||||
|
||||
Description: "Basic Budget"
|
||||
|
||||
Resources:
|
||||
BudgetExample:
|
||||
Type: "AWS::Budgets::Budget"
|
||||
Properties:
|
||||
Budget:
|
||||
BudgetLimit:
|
||||
Amount: 1000
|
||||
Unit: USD
|
||||
TimeUnit: MONTHLY
|
||||
BudgetType: COST
|
||||
NotificationsWithSubscribers:
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 99
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 95
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 90
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 80
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 70
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 50
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 30
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
||||
- Notification:
|
||||
NotificationType: ACTUAL
|
||||
ComparisonOperator: GREATER_THAN
|
||||
Threshold: 10
|
||||
Subscribers:
|
||||
- SubscriptionType: EMAIL
|
||||
Address: !Ref emailAddress
|
686
aws-batch-comp-infrastructure-sample/infrastructure/solver-infrastructure.yaml
Executable file
@ -0,0 +1,686 @@
|
||||
# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
---
|
||||
AWSTemplateFormatVersion: '2010-09-09'
|
||||
Description: 'AWS CloudFormation Sample Template Managed Single ECS Job Queue: This
|
||||
template demonstrates the usage of simple Job Queue and EC2 style Compute Environment
|
||||
along with multi-node jobs (on a relatively small scale: 4 instances, 16 cores each).
|
||||
N.B.: This is all boilerplate until the EcsInstanceRole! '
|
||||
|
||||
Parameters:
|
||||
ProjectName:
|
||||
Type: String
|
||||
Default: "proofs"
|
||||
Description: "S3 bucket will be AccountId-Region-ProjectName"
|
||||
|
||||
AvailZoneId:
|
||||
Type: String
|
||||
Default: "a"
|
||||
Description: "Availability Zone ID"
|
||||
|
||||
InstanceType:
|
||||
Type: String
|
||||
Default: "m6i.4xlarge"
|
||||
Description: "Instance type"
|
||||
|
||||
ContainerMemory:
|
||||
Type: String
|
||||
Description: "Memory Size for containers (61000 for cloud, 253000 for parallel)"
|
||||
|
||||
AmiImageId:
|
||||
Type: String
|
||||
Description: "Ami for your instances"
|
||||
|
||||
Resources:
|
||||
|
||||
VPC:
|
||||
Type: AWS::EC2::VPC
|
||||
Properties:
|
||||
CidrBlock: 10.0.0.0/16
|
||||
EnableDnsHostnames: true
|
||||
EnableDnsSupport: true
|
||||
|
||||
SecondCidr:
|
||||
Type: AWS::EC2::VPCCidrBlock
|
||||
Properties:
|
||||
CidrBlock: 10.1.0.0/16
|
||||
VpcId: !Ref VPC
|
||||
|
||||
InternetGateway:
|
||||
Type: AWS::EC2::InternetGateway
|
||||
|
||||
EIP:
|
||||
Type: 'AWS::EC2::EIP'
|
||||
Properties:
|
||||
Domain: vpc
|
||||
|
||||
|
||||
VPCGatewayAttachment:
|
||||
Type: AWS::EC2::VPCGatewayAttachment
|
||||
Properties:
|
||||
VpcId:
|
||||
Ref: VPC
|
||||
InternetGatewayId:
|
||||
Ref: InternetGateway
|
||||
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: EC2 Security Group for instances launched in the VPC by Batch
|
||||
VpcId:
|
||||
Ref: VPC
|
||||
SecurityGroupIngress:
|
||||
- CidrIp: 10.0.0.0/0
|
||||
Description: All TCP ports within the VPC
|
||||
FromPort: 0
|
||||
IpProtocol: TCP
|
||||
ToPort: 65535
|
||||
|
||||
NatGateway:
|
||||
Type: 'AWS::EC2::NatGateway'
|
||||
Properties:
|
||||
AllocationId: !GetAtt 'EIP.AllocationId'
|
||||
SubnetId: !Ref SubnetPublic
|
||||
|
||||
NatAttachment:
|
||||
Type: AWS::EC2::VPCGatewayAttachment
|
||||
Properties:
|
||||
InternetGatewayId: !Ref InternetGateway
|
||||
VpcId: !Ref VPC
|
||||
|
||||
SubnetPrivate:
|
||||
Type: AWS::EC2::Subnet
|
||||
DependsOn: SecondCidr
|
||||
Properties:
|
||||
AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
|
||||
CidrBlock: 10.1.0.0/16
|
||||
VpcId: !Ref VPC
|
||||
RouteTablePrivate:
|
||||
Type: AWS::EC2::RouteTable
|
||||
Properties:
|
||||
VpcId:
|
||||
Ref: VPC
|
||||
RoutePrivate:
|
||||
Type: AWS::EC2::Route
|
||||
Properties:
|
||||
RouteTableId:
|
||||
Ref: RouteTablePrivate
|
||||
DestinationCidrBlock: 0.0.0.0/0
|
||||
NatGatewayId: !Ref NatGateway
|
||||
|
||||
SubnetRouteTableAssociationPrivate:
|
||||
Type: AWS::EC2::SubnetRouteTableAssociation
|
||||
Properties:
|
||||
RouteTableId: !Ref RouteTablePrivate
|
||||
SubnetId: !Ref SubnetPrivate
|
||||
|
||||
|
||||
SubnetPublic:
|
||||
Type: AWS::EC2::Subnet
|
||||
Properties:
|
||||
AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
|
||||
CidrBlock: 10.0.1.0/24
|
||||
VpcId: !Ref VPC
|
||||
MapPublicIpOnLaunch: 'True'
|
||||
RouteTablePublic:
|
||||
Type: AWS::EC2::RouteTable
|
||||
Properties:
|
||||
VpcId:
|
||||
Ref: VPC
|
||||
RoutePublic:
|
||||
Type: AWS::EC2::Route
|
||||
Properties:
|
||||
RouteTableId:
|
||||
Ref: RouteTablePublic
|
||||
DestinationCidrBlock: 0.0.0.0/0
|
||||
GatewayId: !Ref InternetGateway
|
||||
|
||||
SubnetRouteTableAssociationPublic:
|
||||
Type: AWS::EC2::SubnetRouteTableAssociation
|
||||
Properties:
|
||||
RouteTableId: !Ref RouteTablePublic
|
||||
SubnetId: !Ref SubnetPublic
|
||||
|
||||
EcsCluster:
|
||||
Type: AWS::ECS::Cluster
|
||||
Properties:
|
||||
ClusterName: SatCompCluster
|
||||
ClusterSettings:
|
||||
- Name: containerInsights
|
||||
Value: enabled
|
||||
|
||||
Ec2AutoscaleInstanceProfile:
|
||||
Type: "AWS::IAM::InstanceProfile"
|
||||
Properties:
|
||||
Path: "/"
|
||||
Roles:
|
||||
- Ref: "InstanceRole"
|
||||
InstanceRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
ManagedPolicyArns:
|
||||
- arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
|
||||
AssumeRolePolicyDocument:
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Principal:
|
||||
Service: [ ec2.amazonaws.com ]
|
||||
Action: [ 'sts:AssumeRole' ]
|
||||
Path: /
|
||||
Policies:
|
||||
- PolicyName: ecs-service
|
||||
PolicyDocument:
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Action:
|
||||
# Rules which allow ECS to attach network interfaces to instances
|
||||
# on your behalf in order for awsvpc networking mode to work right
|
||||
- 'ec2:AttachNetworkInterface'
|
||||
- 'ec2:CreateNetworkInterface'
|
||||
- 'ec2:CreateNetworkInterfacePermission'
|
||||
- 'ec2:DeleteNetworkInterface'
|
||||
- 'ec2:DeleteNetworkInterfacePermission'
|
||||
- 'ec2:Describe*'
|
||||
- 'ec2:DetachNetworkInterface'
|
||||
- 'elasticfilesystem:*'
|
||||
- 'cloudwatch:*'
|
||||
- 'ecs:*'
|
||||
# Rules which allow ECS to update load balancers on your behalf
|
||||
# with the information about how to send traffic to your containers
|
||||
- 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
|
||||
- 'elasticloadbalancing:DeregisterTargets'
|
||||
- 'elasticloadbalancing:Describe*'
|
||||
- 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
|
||||
- 'elasticloadbalancing:RegisterTargets'
|
||||
- 's3:GetObject'
|
||||
- 's3:GetObjectVersion'
|
||||
Resource: '*'
|
||||
|
||||
EcsInstanceLc:
|
||||
Type: AWS::AutoScaling::LaunchConfiguration
|
||||
Properties:
|
||||
ImageId: !Ref AmiImageId
|
||||
InstanceType: !Select [ 0, [ !Ref InstanceType ] ]
|
||||
AssociatePublicIpAddress: true
|
||||
IamInstanceProfile: !GetAtt Ec2AutoscaleInstanceProfile.Arn
|
||||
SecurityGroups: [ !Ref SecurityGroup ]
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/xvda
|
||||
Ebs:
|
||||
VolumeSize: 30
|
||||
VolumeType: gp2
|
||||
- DeviceName: /dev/xvdcz
|
||||
Ebs:
|
||||
VolumeSize: 22
|
||||
VolumeType: gp2
|
||||
UserData:
|
||||
Fn::Base64: !Sub |
|
||||
#!/bin/bash -xe
|
||||
echo ECS_CLUSTER=${EcsCluster} >> /etc/ecs/ecs.config
|
||||
yum install -y aws-cfn-bootstrap python-pip
|
||||
pip install awscli boto3
|
||||
/opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSAutoScalingGroup --region ${AWS::Region}
|
||||
|
||||
EcsInstanceAsg:
|
||||
Type: AWS::AutoScaling::AutoScalingGroup
|
||||
UpdatePolicy:
|
||||
AutoScalingRollingUpdate:
|
||||
MaxBatchSize: 10
|
||||
MinSuccessfulInstancesPercent: 95
|
||||
PauseTime: PT30M
|
||||
SuspendProcesses: [ HealthCheck, ReplaceUnhealthy, AZRebalance, AlarmNotification,
|
||||
ScheduledActions ]
|
||||
WaitOnResourceSignals: 'true'
|
||||
Properties:
|
||||
VPCZoneIdentifier: [ !Ref SubnetPrivate ]
|
||||
LaunchConfigurationName: !Ref EcsInstanceLc
|
||||
MinSize: '0'
|
||||
MaxSize: '0'
|
||||
DesiredCapacity: '0'
|
||||
Tags:
|
||||
- Key: IsAutoscaledCluster
|
||||
PropagateAtLaunch: true
|
||||
Value: "true"
|
||||
- Key: "Patch Group"
|
||||
PropagateAtLaunch: true
|
||||
Value: "ManagedClusterPatchGroup"
|
||||
|
||||
AutoscaleCapacityProvider:
|
||||
Type: AWS::ECS::CapacityProvider
|
||||
Properties:
|
||||
AutoScalingGroupProvider:
|
||||
AutoScalingGroupArn: !Ref EcsInstanceAsg
|
||||
ManagedTerminationProtection: DISABLED
|
||||
ManagedScaling:
|
||||
Status: DISABLED
|
||||
Name: AutoscaleCapacityProvider
|
||||
|
||||
# A security group for the containers we will run in Fargate.
|
||||
# Rules are added to this security group based on what ingress you
|
||||
# add for the cluster.
|
||||
ContainerSecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: Access to the Fargate containers
|
||||
VpcId: !Ref 'VPC'
|
||||
|
||||
# A role used to allow AWS Autoscaling to inspect stats and adjust scaleable targets
|
||||
# on your AWS account
|
||||
AutoscalingRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
AssumeRolePolicyDocument:
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Principal:
|
||||
Service: [ application-autoscaling.amazonaws.com ]
|
||||
Action: [ 'sts:AssumeRole' ]
|
||||
Path: /
|
||||
Policies:
|
||||
- PolicyName: service-autoscaling
|
||||
PolicyDocument:
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Action:
|
||||
- 'application-autoscaling:*'
|
||||
- 'cloudwatch:DescribeAlarms'
|
||||
- 'cloudwatch:PutMetricAlarm'
|
||||
- 'ecs:DescribeServices'
|
||||
- 'ecs:UpdateService'
|
||||
Resource: '*'
|
||||
|
||||
# This is an IAM role which authorizes ECS to manage resources on your
|
||||
# account on your behalf, such as updating your load balancer with the
|
||||
# details of where your containers are, so that traffic can reach your
|
||||
# containers.
|
||||
ECSRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
AssumeRolePolicyDocument:
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Principal:
|
||||
Service: [ ecs.amazonaws.com ]
|
||||
Action: [ 'sts:AssumeRole' ]
|
||||
Path: /
|
||||
Policies:
|
||||
- PolicyName: ecs-service
|
||||
PolicyDocument:
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Action:
|
||||
# Rules which allow ECS to attach network interfaces to instances
|
||||
# on your behalf in order for awsvpc networking mode to work right
|
||||
- 'ec2:AttachNetworkInterface'
|
||||
- 'ec2:CreateNetworkInterface'
|
||||
- 'ec2:CreateNetworkInterfacePermission'
|
||||
- 'ec2:DeleteNetworkInterface'
|
||||
- 'ec2:DeleteNetworkInterfacePermission'
|
||||
- 'ec2:Describe*'
|
||||
- 'ec2:DetachNetworkInterface'
|
||||
- 'elasticfilesystem:*'
|
||||
|
||||
# Rules which allow ECS to update load balancers on your behalf
|
||||
# with the information about how to send traffic to your containers
|
||||
- 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
|
||||
- 'elasticloadbalancing:DeregisterTargets'
|
||||
- 'elasticloadbalancing:Describe*'
|
||||
- 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
|
||||
- 'elasticloadbalancing:RegisterTargets'
|
||||
Resource: '*'
|
||||
|
||||
SsmVpcEndpoint:
|
||||
Type: AWS::EC2::VPCEndpoint
|
||||
Properties:
|
||||
VpcEndpointType: Interface
|
||||
ServiceName: !Sub 'com.amazonaws.${AWS::Region}.ssm'
|
||||
VpcId: !Ref VPC
|
||||
SecurityGroupIds: [ !Ref SecurityGroup ]
|
||||
SubnetIds: [ !Ref SubnetPrivate ]
|
||||
PrivateDnsEnabled: true
|
||||
|
||||
EcsTaskRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
AssumeRolePolicyDocument:
|
||||
Version: '2012-10-17'
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Principal:
|
||||
Service: ecs-tasks.amazonaws.com
|
||||
Action: sts:AssumeRole
|
||||
ManagedPolicyArns:
|
||||
- arn:aws:iam::aws:policy/AmazonEC2FullAccess
|
||||
- arn:aws:iam::aws:policy/AmazonS3FullAccess
|
||||
Policies:
|
||||
- PolicyName: !Sub "project-metrics-${AWS::Region}-${ProjectName}"
|
||||
PolicyDocument:
|
||||
Version: 2012-10-17
|
||||
Statement:
|
||||
- Action:
|
||||
- cloudwatch:PutMetricData
|
||||
Effect: Allow
|
||||
Resource: "*"
|
||||
- PolicyName: SatCompQueueExecutionRolePolicy
|
||||
PolicyDocument:
|
||||
Version: 2012-10-17
|
||||
Statement:
|
||||
- Action:
|
||||
- "sqs:GetQueueAttributes"
|
||||
- "sqs:SendMessage"
|
||||
- "sqs:ReceiveMessage"
|
||||
- "sqs:DeleteMessage"
|
||||
- "sqs:DeleteMessageBatch"
|
||||
- "sqs:GetQueueUrl"
|
||||
Effect: Allow
|
||||
Resource: "*"
|
||||
- PolicyName: SatCompDynamoExecutionRolePolicy
|
||||
PolicyDocument:
|
||||
Version: 2012-10-17
|
||||
Statement:
|
||||
- Action:
|
||||
- "dynamodb:UpdateItem"
|
||||
- "dynamodb:Scan"
|
||||
Effect: Allow
|
||||
Resource: "*"
|
||||
|
||||
  SolverLeaderTaskDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
      TaskRoleArn: !GetAtt EcsTaskRole.Arn
      NetworkMode: awsvpc

      RequiresCompatibilities:
        - EC2
      Volumes:
        - Name: SatVolume
          EFSVolumeConfiguration:
            FilesystemId: !Ref FileSystem
            TransitEncryption: ENABLED
            AuthorizationConfig:
              AccessPointId: !Ref AccessPoint

      ContainerDefinitions:
        - Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:leader"
          Name: !Sub "${ProjectName}-leader"
          MemoryReservation: 61000
          Environment:
            - Name: SQS_QUEUE_NAME
              Value: !GetAtt SatCompQueue.QueueName
            - Name: SQS_OUTPUT_QUEUE_NAME
              Value: !GetAtt SatCompOutputQueue.QueueName
            - Name: AWS_DEFAULT_REGION
              Value: !Ref AWS::Region
            - Name: SATCOMP_BUCKET_NAME
              Value: !Ref SatCompBucket
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: !Sub "/ecs/${ProjectName}-leader"
              awslogs-region: !Ref AWS::Region
              awslogs-stream-prefix: ecs
          MountPoints:
            - ContainerPath: '/mount/efs'
              SourceVolume: SatVolume
          PortMappings:
            - HostPort: 22
              ContainerPort: 22
              Protocol: TCP
  SolverLogGroupLeader:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub "/ecs/${ProjectName}-leader"
      RetentionInDays: 1827

  SolverWorkerTaskDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
      TaskRoleArn: !GetAtt EcsTaskRole.Arn
      NetworkMode: awsvpc

      RequiresCompatibilities:
        - EC2
      Volumes:
        - Name: SatVolume
          EFSVolumeConfiguration:
            FilesystemId: !Ref FileSystem
            TransitEncryption: ENABLED
            AuthorizationConfig:
              AccessPointId: !Ref AccessPoint

      ContainerDefinitions:
        - Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:worker"
          Name: !Sub "${ProjectName}-worker"
          MemoryReservation: 61000
          Environment:
            - Name: SQS_QUEUE_NAME
              Value: !GetAtt SatCompQueue.QueueName
            - Name: SQS_OUTPUT_QUEUE_NAME
              Value: !GetAtt SatCompOutputQueue.QueueName
            - Name: AWS_DEFAULT_REGION
              Value: !Ref AWS::Region
            - Name: SATCOMP_BUCKET_NAME
              Value: !Ref SatCompBucket
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: !Sub "/ecs/${ProjectName}-worker"
              awslogs-region: !Ref AWS::Region
              awslogs-stream-prefix: ecs
          MountPoints:
            - ContainerPath: '/mount/efs'
              SourceVolume: SatVolume
          PortMappings:
            - HostPort: 22
              ContainerPort: 22
              Protocol: TCP
  SolverLogGroupWorker:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub "/ecs/${ProjectName}-worker"
      RetentionInDays: 1827

  ECSTaskExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service: [ ecs-tasks.amazonaws.com ]
            Action: [ 'sts:AssumeRole' ]
      Path: /
      Policies:
        - PolicyName: AmazonECSTaskExecutionRolePolicy
          PolicyDocument:
            Statement:
              - Effect: Allow
                Action:
                  # Allow the ECS Tasks to download images from ECR
                  - 'ecr:GetAuthorizationToken'
                  - 'ecr:BatchCheckLayerAvailability'
                  - 'ecr:GetDownloadUrlForLayer'
                  - 'ecr:BatchGetImage'

                  # Allow the ECS tasks to upload logs to CloudWatch
                  - 'logs:CreateLogStream'
                  - 'logs:PutLogEvents'
                Resource: '*'

  SolverLeaderService:
    Type: AWS::ECS::Service
    Properties:
      Cluster:
        Ref: EcsCluster
      DesiredCount: 0
      LaunchType: EC2
      NetworkConfiguration:
        AwsvpcConfiguration:
          SecurityGroups:
            - !Ref SecurityGroup
          Subnets:
            - !Ref SubnetPrivate
      TaskDefinition:
        Ref: SolverLeaderTaskDefinition

  SolverWorkerService:
    Type: AWS::ECS::Service
    Properties:
      Cluster:
        Ref: EcsCluster
      DesiredCount: 0
      LaunchType: EC2
      NetworkConfiguration:
        AwsvpcConfiguration:
          SecurityGroups:
            - !Ref SecurityGroup
          Subnets:
            - !Ref SubnetPrivate
      TaskDefinition:
        Ref: SolverWorkerTaskDefinition

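Both solver services above are created with `DesiredCount: 0`, so no containers run until the counts are raised later; the `manage-solver-infrastructure.py` helper wrapped by the shell script at the end of this change appears to be the intended entry point for that. A rough, hypothetical boto3 sketch of that scale-up step — the cluster and service identifiers below are placeholders, since ECS assigns the real names when the stack is created:

```python
# Hypothetical sketch: raise the DesiredCount of the leader and worker services.
# Cluster/service names are placeholders, not values taken from this template.
import boto3


def set_solver_counts(cluster_name: str, leader_service: str, worker_service: str,
                      num_workers: int) -> None:
    """Run one leader task and `num_workers` worker tasks."""
    ecs = boto3.client("ecs")
    ecs.update_service(cluster=cluster_name, service=leader_service, desiredCount=1)
    ecs.update_service(cluster=cluster_name, service=worker_service, desiredCount=num_workers)


if __name__ == "__main__":
    # Substitute the names reported by the deployed stack.
    set_solver_counts("SatCompCluster", "SolverLeaderService", "SolverWorkerService", 3)
```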
  AccessPoint:
    Type: AWS::EFS::AccessPoint
    Properties:
      FileSystemId: !Ref FileSystem
      PosixUser:
        Uid: "1001"
        Gid: "1001"
      RootDirectory:
        CreationInfo:
          OwnerGid: "1001"
          OwnerUid: "1001"
          Permissions: "0755"
        Path: "/sat-comp"

  FileSystem:
    Type: AWS::EFS::FileSystem
    Properties:
      PerformanceMode: generalPurpose

  EFSMountTarget:
    Type: AWS::EFS::MountTarget
    Properties:
      FileSystemId: !Ref FileSystem
      SecurityGroups:
        - !Ref SecurityGroup
      SubnetId: !Ref SubnetPrivate
  SatCompQueue:
    Type: AWS::SQS::Queue
    Properties:
      QueueName: !Sub "${AWS::AccountId}-${AWS::Region}-SatCompQueue"
      VisibilityTimeout: 5
      ReceiveMessageWaitTimeSeconds: 20
      KmsDataKeyReusePeriodSeconds: 3600
      KmsMasterKeyId: "alias/aws/sqs"
  SatCompOutputQueue:
    Type: AWS::SQS::Queue
    Properties:
      QueueName: !Sub "${AWS::AccountId}-${AWS::Region}-SatCompOutputQueue"
      VisibilityTimeout: 5
      ReceiveMessageWaitTimeSeconds: 20
      KmsDataKeyReusePeriodSeconds: 3600
      KmsMasterKeyId: "alias/aws/sqs"

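The task definitions pass these queue names to the containers through the SQS_QUEUE_NAME and SQS_OUTPUT_QUEUE_NAME environment variables. A minimal sketch of how a solver container might consume work from the input queue and report results on the output queue, assuming a plain-text message format that this template does not define:

```python
# Minimal sketch of a solver loop driven by the queues defined above.
# Only the env var names come from the template; the message body format is an assumption.
import os
import boto3

sqs = boto3.resource("sqs")
in_queue = sqs.get_queue_by_name(QueueName=os.environ["SQS_QUEUE_NAME"])
out_queue = sqs.get_queue_by_name(QueueName=os.environ["SQS_OUTPUT_QUEUE_NAME"])

while True:
    # Long polling matches ReceiveMessageWaitTimeSeconds: 20 on the queue.
    for message in in_queue.receive_messages(WaitTimeSeconds=20, MaxNumberOfMessages=1):
        problem_location = message.body      # e.g. an S3 key of a CNF file (assumption)
        result = "UNKNOWN"                   # placeholder for the actual solver call
        out_queue.send_message(MessageBody=f"{problem_location}: {result}")
        message.delete()
```

Note that VisibilityTimeout is only 5 seconds, so a long-running solve would need to delete the message promptly after reading it or extend its visibility timeout while working.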
  NodeManifest:
    Type: AWS::DynamoDB::Table
    Properties:
      AttributeDefinitions:
        - AttributeName: nodeId
          AttributeType: S
      BillingMode: PAY_PER_REQUEST
      KeySchema:
        - AttributeName: nodeId
          KeyType: HASH
      TableName: SatCompNodeManifest

  TaskEndNotification:
    Type: AWS::DynamoDB::Table
    Properties:
      AttributeDefinitions:
        - AttributeName: leaderIp
          AttributeType: S
      BillingMode: PAY_PER_REQUEST
      KeySchema:
        - AttributeName: leaderIp
          KeyType: HASH
      TableName: TaskEndNotification

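The EcsTaskRole defined earlier grants only dynamodb:UpdateItem and dynamodb:Scan, and these tables key on nodeId and leaderIp. That is consistent with nodes registering themselves in the manifest and peers discovering each other by scanning it; a hypothetical sketch of that pattern (the ipAddress attribute is invented for illustration):

```python
# Hypothetical sketch: register this node in the manifest table and list all nodes.
# Table name and key attribute come from the template; "ipAddress" is invented.
import boto3

dynamodb = boto3.resource("dynamodb")
manifest = dynamodb.Table("SatCompNodeManifest")


def register_node(node_id: str, ip_address: str) -> None:
    # UpdateItem creates the item if it does not exist yet.
    manifest.update_item(
        Key={"nodeId": node_id},
        UpdateExpression="SET ipAddress = :ip",
        ExpressionAttributeValues={":ip": ip_address},
    )


def list_nodes() -> list:
    return manifest.scan()["Items"]
```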
  SolverLeaderEcr:
    Type: AWS::ECR::Repository
    Properties:
      RepositoryName: !Sub "${ProjectName}"
      LifecyclePolicy:
        LifecyclePolicyText: '
          {
            "rules": [ {
              "rulePriority": 10,
              "description": "remove untagged images except the latest one",
              "selection": {
                "tagStatus": "untagged",
                "countType": "imageCountMoreThan",
                "countNumber": 1
              },
              "action": {
                "type": "expire"
              }
            } ]
          }'

  # SolverWorkerEcr:
  #   Type: AWS::ECR::Repository
  #   Properties:
  #     RepositoryName: !Sub "${ProjectName}-worker"
  #     LifecyclePolicy:
  #       LifecyclePolicyText: '
  #         {
  #           "rules": [ {
  #             "rulePriority": 10,
  #             "description": "remove untagged images except the latest one",
  #             "selection": {
  #               "tagStatus": "untagged",
  #               "countType": "imageCountMoreThan",
  #               "countNumber": 1
  #             },
  #             "action": {
  #               "type": "expire"
  #             }
  #           } ]
  #         }'

  SatCompBucket:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Sub "${AWS::AccountId}-${AWS::Region}-${ProjectName}"
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256
      PublicAccessBlockConfiguration:
        BlockPublicAcls: true
        BlockPublicPolicy: true
        IgnorePublicAcls: true
        RestrictPublicBuckets: true

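The bucket name follows the ${AWS::AccountId}-${AWS::Region}-${ProjectName} pattern and is exposed to the containers as SATCOMP_BUCKET_NAME, so problem files are presumably staged there. A hypothetical sketch of submitting a problem, reusing the naming patterns above — the key prefix and message format are assumptions:

```python
# Hypothetical sketch: stage a CNF problem file in the bucket and notify the input queue.
# Bucket/queue names follow the template's !Sub patterns; key prefix and message format
# are invented for illustration.
import boto3


def submit_problem(account_id: str, region: str, project: str, cnf_path: str) -> None:
    bucket = f"{account_id}-{region}-{project}"
    key = f"problems/{cnf_path.rsplit('/', 1)[-1]}"

    boto3.client("s3").upload_file(cnf_path, bucket, key)

    queue_name = f"{account_id}-{region}-SatCompQueue"
    queue = boto3.resource("sqs").get_queue_by_name(QueueName=queue_name)
    queue.send_message(MessageBody=f"s3://{bucket}/{key}")
```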
Outputs:
  Subnet:
    Value:
      Ref: SubnetPrivate
    Export:
      Name: SubnetId
  SecurityGroupId:
    Value:
      Ref: SecurityGroup
    Export:
      Name: SecurityGroup
@ -0,0 +1,4 @@
c
p cnf 3 2
1 -3 0
2 3 -1 0
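The new four-line file is a minimal DIMACS CNF example: the `c` line is a comment, `p cnf 3 2` declares 3 variables and 2 clauses, and each remaining line is a clause written as a 0-terminated list of literals. For reference, a small parser sketch:

```python
# Minimal DIMACS CNF reader for files like the example above.
def parse_dimacs(path: str) -> tuple[int, list[list[int]]]:
    num_vars, clauses = 0, []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("c"):
                continue                      # skip blank lines and comments
            if line.startswith("p cnf"):
                num_vars = int(line.split()[2])
                continue
            literals = [int(tok) for tok in line.split()]
            assert literals[-1] == 0          # clauses are terminated by 0
            clauses.append(literals[:-1])
    return num_vars, clauses

# For the example file this returns (3, [[1, -3], [2, 3, -1]]).
```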
@ -0,0 +1,2 @@
#!/bin/sh
python3 ./manage-solver-infrastructure.py --mode update $*