Add AWS example

This commit is contained in:
YuhangQ 2023-03-28 08:48:09 +00:00
parent b1e7c36187
commit 2a2a417bbb
92 changed files with 5066 additions and 0 deletions

View File

@ -0,0 +1,3 @@
# Default ignored files
/workspace.xml

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="pytest" />
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/aws-batch-comp-infrastructure-sample.iml" filepath="$PROJECT_DIR$/.idea/aws-batch-comp-infrastructure-sample.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -0,0 +1,4 @@
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.

View File

@ -0,0 +1,61 @@
# Contributing Guidelines
Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
documentation, we greatly value feedback and contributions from our community.
Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
information to effectively respond to your bug report or contribution.
## Reporting Bugs/Feature Requests
We welcome you to use the GitHub issue tracker to report bugs or suggest features.
When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment
## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
1. You are working against the latest source on the *master* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
To send us a pull request, please:
1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.
## Security issue notifications
If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
## Licensing
See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.

View File

@ -0,0 +1,15 @@
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,75 @@
# Changes from 2022 SAT/SMT-Comp Interactions
The fundamental architecture for the SAT/SMT competition is unchanged from 2022. However, based on feedback from competitors in 2022, we have made some small changes to how the infrastructure interacts with the solvers provided by each team through the `input.json` file and `solver-out.json` files.
## Changes to input.json
We have enriched the `input.json` file to allow solvers to be invoked with different arguments / solver names to allow testing of different strategies without rebuilding the container, and to specify a timeout for the solver.
The 2022 format for input.json was as follows:
```text
{
"problem_path": "/mount/efs/problem.cnf",
"worker_node_ips": ["192.158.1.38", "192.158.2.39", ...]
}
```
In 2023, we have expanded this format to:
```text
{
"formula_file": "/mount/efs/problem.cnf",
"formula_language": "DIMACS",
"solver_argument_list": ["-p", "another_arg"],
"timeout_seconds": 10,
"worker_node_ips": ["192.158.1.38", "192.158.2.39", ...]
}
```
where:
* `formula_file` is the path to the SAT/SMT problem to be solved.
* `formula_language` is the encoding of the problem (currently we use DIMACS for SAT-Comp and SMTLIB2 for SMT-Comp). This field is optional and can be ignored by the solver.
* `solver_argument_list` allows passthrough of arguments to the solver, so you can try different strategies by varying the arguments without rebuilding your docker container.
* `timeout_seconds` is the timeout for the solver. It will be enforced by the infrastructure; a solver that doesn't complete within the timeout will be terminated.
* `worker_node_ips` is unchanged; for cloud solvers, it is the list of worker nodes. For parallel solvers this field will always be the empty list.
The executable / python file that starts the solver should be updated to support this new format.
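For illustration, here is a minimal sketch of how a solver script might consume the new fields (the solver binary `/competition/my_solver` and its flags are hypothetical; only the `input.json` fields come from the format above):
```python
#!/usr/bin/env python3
# Minimal sketch: read the 2023 input.json and launch a (hypothetical) solver binary.
import json
import os
import subprocess
import sys

def run_from_request_dir(request_directory: str) -> int:
    with open(os.path.join(request_directory, "input.json")) as f:
        spec = json.load(f)

    cmd = ["/competition/my_solver"]             # hypothetical solver binary
    cmd += spec.get("solver_argument_list", [])  # passthrough arguments
    cmd.append(spec["formula_file"])             # problem to solve
    # spec.get("formula_language") is optional and may be ignored.

    # timeout_seconds is enforced by the infrastructure; honoring it locally keeps
    # behavior consistent when testing outside the competition environment.
    timeout = spec.get("timeout_seconds")
    try:
        proc = subprocess.run(cmd, timeout=float(timeout) if timeout else None)
        return proc.returncode  # to be recorded in solver_out.json (see below)
    except subprocess.TimeoutExpired:
        return 0  # UNKNOWN under the 2023 return-code convention

if __name__ == "__main__":
    print(run_from_request_dir(sys.argv[1]))
```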
## Changes to solver_out.json
The 2022 format for solver_out.json was as follows:
```text
{
"return_code": Number, // of running the solve command
"result": String, // Should be one of {"SAT", "UNSAT", "UNKNOWN", "ERROR"}
"artifacts": {
"stdout_path": String, // Where to find the stdout of the solve run
"stderr_path": String // Where to find the stderr of the solve run
}
}
```
In 2023, we have modified the format to:
```text
{
"return_code": int, // The return code for the solver.
"artifacts": {
"stdout_path": String, // Where to find the stdout of the solve run
"stderr_path": String // Where to find the stderr of the solve run
}
}
```
where:
* `return_code` is the return code for the solver. **N.B.**: Unlike the 2022 format, the return_code in `solver_out.json` will determine the satisfiability of the problem for the solver: A return code of 10 indicates SAT, 20 indicates UNSAT, 0 indicates UNKNOWN, and all other return codes indicate an error.
* `stdout_path` is the path to the stdout log.
* `stderr_path` is the path to the stderr log.
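For example, a solver wrapper might map its textual verdict to the required return code and write `solver_out.json` like this (a minimal sketch; the verdict strings are assumptions and should be adapted to your solver's output):
```python
import json
import os

# 2023 return-code convention: 10 = SAT, 20 = UNSAT, 0 = UNKNOWN, anything else = error.
RETURN_CODES = {"SAT": 10, "UNSAT": 20, "UNKNOWN": 0}

def write_solver_out(request_directory: str, verdict: str) -> None:
    solver_out = {
        "return_code": RETURN_CODES.get(verdict, 1),
        "artifacts": {
            "stdout_path": os.path.join(request_directory, "stdout.log"),
            "stderr_path": os.path.join(request_directory, "stderr.log"),
        },
    }
    with open(os.path.join(request_directory, "solver_out.json"), "w") as f:
        json.dump(solver_out, f)

# Example: write_solver_out("/rundir", "UNSAT") records return_code 20.
```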
The Mallob example solver has been updated to support the new format, so you can see these changes in the leader solver python file [here](docker/mallob-images/leader/solver). This file is compatible with the new format, but does not use the `formula_language`, `timeout_seconds` or `solver_argument_list` fields.
Other than these changes, solvers should be able to be run unchanged on the 2023 architecture. Much more information on solver development is available in the [Solver Development README](docker/README-Solver-Development.md).
If you have problems please contact us at either sat-comp@amazon.com (for SAT-Comp) or aws-smtcomp-2023@googlegroups.com (for SMT-Comp).

View File

@ -0,0 +1,82 @@
# SAT-Comp and SMT-Comp Cloud Track Instructions
This repository will help you get your parallel or distributed solver running efficiently on AWS. You will build docker containers with your solver and then connect them to the AWS infrastructure.
We recommend that you work in four steps:
1. Create and configure an AWS Account for the competition. Please do this right away and send us an email, so we can start the process to give you AWS credits. You can then continue with step 2 while waiting for us to answer.
2. Build your solver as a Docker image and run experiments locally.
3. Set up AWS infrastructure and test your solver on the cloud.
4. When ready, share the solver repository and Docker image with us.
We describe step 1 in this document. Step 2 is described in the [Solver Development README](docker/README-Solver-Development.md), while steps 3 and 4 are described in the [Infrastructure README](infrastructure/README-Infrastructure.md).
## Creating an AWS Account
First, create a specific AWS account for the competition. If you have not created an AWS account previously, it is straightforward to do, requiring a cell phone number, credit card, and address. Make sure to register your account with an institutional email address (and not a private one), otherwise AWS cannot sponsor your account. To create an account, navigate to [aws.amazon.com](https://aws.amazon.com) and follow the instructions.
If you have previously created an AWS account, we strongly advise that you create a separate AWS account for managing the SAT/SMT-Comp tool construction and testing. This makes it much easier for us to manage account credits and billing. Once the new account is created, email us the account number at: sat-comp@amazon.com (for SAT-Comp) or aws-smtcomp-2023@googlegroups.com (for SMT-Comp) and we will apply the appropriate credits to your account.
To find your account ID, click on your account name in the top right corner, and then click "My Account". You should see your Account ID in the Account Settings section.
It is important that you tell us your account number immediately after creating the account, so that we can assign you a resource budget for your experiments. We also need to grant you access to the shared problem sets. Once we hear from you, we will email you an acknowledgment when resources have been added to your account.
## Building Your Solver
Next, it is time to develop your solver! All of the development and most of the testing can be performed on a local laptop, so it is not necessary to wait for AWS credits to get started. Please look at the instructions in the [Solver Development README](docker/README-Solver-Development.md) on how to start building and testing your solver.
## For Returning Competitors:
We have worked hard to improve the developer experience. As compared with the 2022 infrastructure, we have added or modified:
1. Scripts and instructions for local debugging of both parallel and distributed solvers on your workstation/laptop.
1. Automated upload of stdout/stderr and intermediate files for each run to S3 storage so that you can debug runs after completion.
1. A "quickstart" that allows you to build the infrastructure and solver, then run it using only two commands.
1. Default values for most of the required arguments from 2022 to make it much simpler to run commands.
1. A richer `input.json` format that is used for solver communication that allows passing arguments to solvers, a timeout, and information about the input format.
1. A change in the expected solver output format in `solver_out.json`. This year, the satisfiability/unsatisfiability of the problem is indicated by the return code of the solver (10 for SAT, 20 for UNSAT).
1. Various improvements and bug fixes in the scripts for building/running jobs to provide better observability of the process.
Note that the changes to `input.json` and `solver_out.json` are breaking changes for solvers, described in more detail in [README-changes.md](README-changes.md). The changes should require only a short effort to update for 2023.
## Additional Resources: Analysis Problems
The SAT problems for the SAT-Comp 2022 competition are available [here](https://satcompetition.github.io/2022/downloads.html). The SMT problems for SMT-Comp 2022 competition are available [here](https://smt-comp.github.io/2022/benchmarks.html).
## Additional Resources: Solvers
Here are the GitHub repositories for the solvers from the 2022 competitions. **Please note:** the
infrastructure for 2023 has changed slightly to facilitate better debugging and easier builds. In order to run these solvers on the current infrastructure, you must update their handling of `input.json` and `solver_out.json` as described in [README-changes.md](README-changes.md).
SAT-Comp Parallel:
* [ParKissat-RS](https://github.com/mww-aws/ParKissat/tree/RS)
* [ParKissat-PRE](https://github.com/mww-aws/ParKissat/tree/PRE)
* [PaKis22](https://github.com/KTRDeveloper/PaKis22)
* [PaKisMAB22](https://github.com/KTRDeveloper/PaKisMAB22)
* [DPS-Kissat](https://github.com/nabesima/DPS-satcomp2022)
* [NPS-Kissat](https://github.com/nabesima/DPS-satcomp2022/tree/non-det)
* [P-Kissat](https://github.com/vvallade/painless-sat-competition-2022/tree/pkissat)
* [P-MCOMSPS](https://github.com/vvallade/painless-sat-competition-2022)
* [Mallob-ki](https://github.com/domschrei/isc22-mallob/tree/ki)
* [gimsatul](https://github.com/arminbiere/gimsatul)
SAT-Comp Cloud:
* [Paracooba](https://github.com/maximaximal/paracooba-satcomp22)
* [Mallob-kicaliglu](https://github.com/domschrei/isc22-mallob/tree/kicaliglu)
SMT-Comp Parallel:
* [SMTS Cube and Conquer](https://github.com/usi-verification-and-security/aws-smts/tree/parallel-cube-and-conquer-fixed)
* [SMTS Portfolio](https://github.com/usi-verification-and-security/aws-smts/tree/parallel-portfolio)
* [Vampire](https://github.com/vprover/vampire/tree/smtcomp22)
SMT-Comp Cloud:
* [cvc5-cloud](https://github.com/amaleewilson/aws-satcomp-solver-sample/tree/cvc5)
* [SMTS Cube and Conquer](https://github.com/usi-verification-and-security/aws-smts/tree/cloud-cube-and-conquer-fixed)
* [SMTS Portfolio](https://github.com/usi-verification-and-security/aws-smts/tree/cloud-portfolio)
* [Vampire](https://github.com/vprover/vampire/tree/smtcomp22)
## FAQ
#### I already created my AWS account with a non-institutional email address. Can I still change the email address tied to my account?
Yes. To change your email address, follow the instructions at https://repost.aws/knowledge-center/change-email-address.

View File

@ -0,0 +1,4 @@
Release Notes:
==============
03/10/2022: Version 2. Updated to use new base containers and run in ECS.
02/25/2020: Version 1. Sample code documenting how to run SAT solvers in AWS Batch.

View File

@ -0,0 +1,328 @@
# Solver Development for SAT/SMT-Comp
This README covers the process of building your solver and embedding it in docker images. We first show the steps to build and run an example solver, Mallob, then describe how to build your own solver within Docker and test it on your local machine.
## Prerequisites
We have successfully tested on Amazon Linux 2 (AL2), Ubuntu 20, and macOS Monterey (v12.6). Builds on other platforms may work, but have not been tested.
To build and run solvers, you will need the following tools installed:
- [python3](https://www.python.org/). To install the latest version for your platform, go to the [downloads](https://www.python.org/downloads/) page.
- [docker](https://www.docker.com/). There is a button to download Docker Desktop on the main page.
Basic familiarity with Docker will be helpful, but we will walk you through the process step by step. If you need more information, there are a number of excellent tutorials, such as [this one](https://docs.docker.com/get-started/).
# Example: Mallob
This example covers two configurations: parallel (single node, multiple cores) and distributed (multiple nodes, multiple cores per node). Because building a distributed solver is a superset of building a parallel solver, we will note steps that are only required for a distributed solver.
We use [Mallob](https://github.com/domschrei/mallob) as the running example for both parallel and distributed solvers. This example pulls a version of Mallob that we have tested with the AWS infrastructure. Note that although this repository is released under the MIT-0 license, the Dockerfiles use the Mallob project. Mallob and the solvers it uses have other licenses, including the [LGPL 3.0](https://opensource.org/license/lgpl-3-0/) license.
This section proceeds in three steps:
- Building base SATcomp Docker images
- Building Mallob Docker images
- Running Mallob in Docker
## Building the SATComp Base Images
To simplify the solver construction process, we provide base Docker images that manage the infrastructure necessary for solvers as well as access to AWS resources. We will build three images, a common image, a leader, and a worker. One leader coordinates multiple workers. The Dockerfiles and needed resources are contained in the [satcomp-images](satcomp-images) directory.
To begin, navigate to the `satcomp-images` directory and execute the `build_satcomp_images.sh` script. This script will build three images: `satcomp-infrastructure:common`, `satcomp-infrastructure:leader`, and `satcomp-infrastructure:worker` that correspond to the common, leader, and worker images.
### Checking Docker Build Results
After building the docker images, check to make sure the images have built successfully:
1. Run `docker image ls` or `docker images`
2. Make sure that you see `satcomp-infrastructure:common`, `satcomp-infrastructure:leader`, and `satcomp-infrastructure:worker` in the list of images.
You should get a response similar to
% docker image ls
REPOSITORY TAG IMAGE ID CREATED SIZE
satcomp-infrastructure worker 83e1a25f57a9 5 minutes ago 819MB
satcomp-infrastructure leader 766b446bd057 5 minutes ago 817MB
satcomp-infrastructure common 51da12f359f8 6 minutes ago 725MB
ubuntu 20.04 e40cf56b4be3 7 days ago 72.8MB
%
## Building Mallob Images
To build the mallob distributed solver images, we will use the satcomp infrastructure worker and leader images built previously. To begin, cd into the `mallob-images` directory, which contains the needed Dockerfiles and other infrastructure.
To build all three docker images in one step, execute the `build_mallob_images.sh` script. You will want to create a similar script for your solver. More information on the script, and on how to build each docker image individually, is available in the FAQ section of this document.
After building the mallob images, run `docker image ls` and make sure you see both `satcomp-mallob:leader` and `satcomp-mallob:worker` in the list of images.
This is what `docker image ls` shows for the tutorial to this point:
% docker image ls
REPOSITORY TAG IMAGE ID CREATED SIZE
satcomp-mallob worker f3a2276355eb 2 minutes ago 916MB
satcomp-mallob leader c3045f85f3bc 2 minutes ago 914MB
satcomp-mallob common 4bcf7c5a156e 2 minutes ago 1.58GB
satcomp-infrastructure worker dad677687909 11 minutes ago 819MB
satcomp-infrastructure leader c439dfb34537 12 minutes ago 817MB
satcomp-infrastructure common 3b2935f84a7c 12 minutes ago 725MB
ubuntu 20.04 e40cf56b4be3 2 weeks ago 72.8MB
## Running Mallob
We will use `docker run` to create docker _containers_ (processes) as instances of the docker _images_ we created in the previous section.
To find out which docker containers are running, use the command `docker ps`. The command `docker ps -a` will include containers that have already exited. To remove a container, run `docker rm <CONTAINER_ID>`. To remove all containers, run `docker rm $(docker ps -aq)`.
### Creating a Docker Network
Before running mallob we need to create a docker bridge network that our containers can attach to. This is necessary for both parallel and distributed mallob. Create a network named `mallob-test` by running the command `docker network create mallob-test`.
### Running Parallel Mallob
To run parallel Mallob, navigate to the `runner` directory. We have created a simple shell script called `run_parallel.sh`, which you will use to run the Mallob leader image, starting a container and running a SAT/SMT problem in it. The script has two variables that can be configured if you wish (described in the FAQ) but are set to sensible defaults.
**N.B.:** Because the docker image runs as a different user and group than the local host, you need to set the directory permissions so that the Docker image can read and write to the directory. Please run `sudo chgrp -R 1000 . && chmod 775 .` from the `docker/runner` directory so that the container can access this portion of the filesystem.
The `run_parallel.sh` script requires two command-line arguments.
- `<docker_image_name>`, which is `satcomp-mallob` for this example.
- `<query_file>`, which is the name of the test file for the solver. If you use the defaults, you should put SAT/SMT files in the `docker/runner/experiment` subdirectory, and if you run the script from the `runner` directory, you can use standard shell completion for paths.
To run the script with the `test.cnf` file we provided, call `run_parallel.sh satcomp-mallob experiment/test.cnf` from within the `runner` directory. After creating a container, the script will drop you into a bash shell for this container.
The script will create an `input.json` file in the host run directory. This file will be copied to the docker run directory `/rundir`, where it will be read by the solver script.
The script comments explain the various arguments to the `docker run` command. The `docker run` invocation uses bash as the docker entrypoint and passes `init_solver.sh` as a command to bash. The initialization script starts sshd and then runs `/competition/solver /rundir`, which will execute the solver on the query in `<query_file>`. At this point, you should see mallob STDOUT and STDERR on the terminal as mallob runs the solver query.
### Running Distributed Mallob
Running distributed mallob requires two docker invocations running in two different terminal windows: one to start a leader container and one to start a worker container.
To run distributed Mallob, again cd into the `runner` directory. You will use two shell scripts, `run_dist_worker.sh` and `run_dist_leader.sh`.
- Step 1. Invoke `run_dist_worker.sh`, which requires a single command-line argument, `<docker_image_name>`, which is `satcomp-mallob` for this example. Notice that the script will launch a `satcomp-mallob:worker` container. You will be dropped into a bash shell for the worker container. No further commands are needed.
- Step 2. From a different terminal on the host machine (in the same `runner` directory), invoke `run_dist_leader.sh`. This script requires the same two command-line arguments as `run_parallel.sh` in the previous section. For example, you can call `run_dist_leader.sh satcomp-mallob experiment/test.cnf`.
The `run_dist_leader` script will again create an `input.json` file, with more fields than used for parallel mallob. Again, you should see mallob output on the terminal.
### Debugging
After the solver query runs, you will be left at a bash shell prompt that is executing in the docker leader container. At that point, you can explore. You will see the `/competition` directory with the solver scripts, as well as the `/rundir` directory that includes your test file and the solver input file `input.json`.
If mallob ran successfully, you will also find three new output files in `/rundir`:
- `solver_out.json`
- `stdout.log`
- `stderr.log`
Again, note that the container directory `/rundir` is the mount point for the host run directory as specified by the script variable `HOST_RUNDIR`. The files in `/rundir` will persist on the host filesystem, even after the docker container is stopped or removed.
At this point, you can perform additional experiments or exit the docker shell.
The competition infrastructure starts solver containers and keeps them running for multiple queries. Each query will have a new `input.json` file, and `/competition/solver` will be run again.
**N.B.:** When debugging your own solver, the key step (other than making sure your solver ran correctly) is to ensure that you clean up resources between runs of the solver. You should ensure that no solver processes are running and any temporary files are removed between executions. During the competition, the docker images will be left running throughout and each SAT/SMT problem will be injected into the running container. You are responsible for cleaning up files and processes created by your solver. In the case of Mallob, it performs the cleanup of temporary files for the leader when the solver starts (rather than when it finishes), so that you can inspect them after the solver completes execution.
To check for orphaned jobs, use `ps -ax` in both the leader and worker containers. This will show you all running processes. Make sure there are no stray processes that continue execution. In addition, check all the locations in the container where your solver places temporary files to make sure that they are removed after the run.
If your solver doesn't run correctly in the docker container, you can remove the `/competition/solver` commands from the `init_solver.sh` files. Once you are dropped into the docker container's bash shell, you can explore and debug directly, including running `/competition/solver /rundir` from the container shell command line.
We have now run, debugged, and inspected a sample solver. It is a good idea to try out multiple files, inspect the results, and try running in both cloud and parallel configurations. Once you are comfortable with these interactions, it is time to start working on your own solver.
# Preparing Your Own Solver Images
In the previous section, we walked through building and running a sample solver in both cloud and parallel configurations. We assume at this point that you know how to build with Docker and how to run it locally using the scripts that we have provided. We recommend that you run through the Mallob example before you start building your own images, but of course you can start here and then follow the same steps as described in "Running Mallob" using your own images.
In this section, we'll talk about how to build your own solvers into Docker containers that are derived from the base containers that we provide. All of the interaction with AWS services for deployment on the cloud is managed by the Docker base images. We provide two base images: one for leader nodes and one for worker nodes (N.B.: workers are only required for the cloud track). The Leader Node is responsible for collecting IP addresses of available worker nodes before starting the solving process, pulling work from an SQS queue, downloading problem instances from S3, and sharing and coordinating with Worker Nodes. The Worker Node base container is responsible for reporting its status and IP address to the Leader Node. As these base images manage the interaction with AWS, you can build and test your solvers locally, and they should work the same way when deployed on the cloud.
## Building from Competition Base Containers
Your solver must be buildable from source code using a standard process. We use Docker to create a build process that can easily be standardized. Docker removes many of the platform-specific problems with building under one operating system and running in another.
You will provide a GitHub repo with a Dockerfile in the top directory that we will use to build your solver. The first section of this README constructed just such a solver. In this guide, we start by building and running the solver locally; once it is working properly, we will use the [Infrastructure README](../infrastructure/README-Infrastructure.md) to test it using AWS resources. We will use the same process to host your solver during the competition.
For the cloud track, you will need to supply two Dockerfiles for your solver, one that acts as a Leader Node and one that acts as a Worker Node. In the case of a parallel solver, you only need to provide a single image, which we will also call a Leader Node for convenience.
The Docker base images are contained in the `satcomp-images` directory. As you follow this example, you should also consult the `mallob-images` directory to see how we based the Mallob examples on the images in `satcomp-images`.
You should begin your Dockerfiles with:
```text
FROM satcomp-infrastructure:leader
```
or
```text
FROM satcomp-infrastructure:worker
```
The base images (which are based on ubuntu:20.04) have predefined entrypoints that will start when the container is invoked. These entrypoints handle all interaction with the competition infrastructure. As a result, you are only responsible for invoking your solver; we expect that you will not have to write code for most of the interactions with AWS resources.
### Leader Base Container
The Leader Base Container sets up the SAT/SMT problem for the solver. It loads the SAT/SMT problem, then attempts to run an executable file at `/competition/solver` in the docker image. It provides a single argument to this file, which is a directory path containing a file called `input.json`.
Here is an example of the `input.json` file for cloud mallob:
```text
{
"formula_file": "/mount/efs/problem.cnf",
"formula_language": "DIMACS",
"solver_argument_list": ["-p", "another_arg"],
"timeout_seconds": 10,
"worker_node_ips": ["192.158.1.38", "192.158.2.39", ...]
}
```
where:
* `formula_file` is the path to the SAT/SMT problem to be solved.
* `formula_language` is the encoding of the problem (currently we use DIMACS for SAT-Comp and SMTLIB2 for SMT-Comp). This field is optional and can be ignored by the solver.
* `solver_argument_list` allows passthrough of arguments to the solver, so you can try different strategies by varying the arguments without rebuilding your docker container.
* `timeout_seconds` is the timeout for the solver. It will be enforced by the infrastructure; a solver that doesn't complete within the timeout will be terminated.
* `worker_node_ips` is unchanged; for cloud solvers, it is the list of worker nodes. For parallel solvers this field will always be the empty list.
For the interactive example in the first section, this file is generated by the [`run_parallel.sh`](runner/run_parallel.sh) script.
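For cloud solvers, the leader script typically turns `worker_node_ips` into whatever form its communication layer expects. The Mallob example writes an MPI-style hostfile; the sketch below shows that step (the slot count of 4 is the Mallob default and is an assumption for other solvers):
```python
import os

def write_hostfile(worker_node_ips, request_directory, slots_per_host=4):
    """Turn the worker_node_ips list from input.json into an MPI-style hostfile."""
    hostfile_path = os.path.join(request_directory, "combined_hostfile")
    with open(hostfile_path, "w") as f:
        for ip in worker_node_ips:
            f.write(f"{ip} slots={slots_per_host}\n")
    return hostfile_path

# For a parallel (single-node) solver, worker_node_ips is the empty list and the
# leader simply runs everything locally.
```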
The solver is then expected to run and generate a file called `solver_out.json`.
Here is an example of `solver_out.json` from distributed mallob:
```text
{
"return_code": int, // The return code for the solver.
"artifacts": {
"stdout_path": String, // Where to find the stdout of the solve run
"stderr_path": String // Where to find the stderr of the solve run
}
}
```
where:
* `return_code` is the return code for the solver. **N.B.**: The return_code in `solver_out.json` will determine the satisfiability of the problem for the solver: A return code of 10 indicates SAT, 20 indicates UNSAT, 0 indicates UNKNOWN, and all other return codes indicate an error.
* `stdout_path` is the path to the stdout log.
* `stderr_path` is the path to the stderr log.
You will adapt the Mallob `solver` script to invoke your solver and coordinate with Worker nodes. You should extend the competition base image by copying your `solver` script to a new container that is based on the Competition Base Container.
For example:
```text
FROM satcomp-base:leader
# Do all steps necessary to build your solver
RUN apt-get install my-amazing-solver
# Add your solver script
COPY solver /competition/solver
RUN chmod +x /competition/solver
```
Consult the Mallob [leader Dockerfile](mallob-images/leader/Dockerfile) for a more complete example. Note that you should not provide a docker entrypoint or a CMD because the entrypoint is specified by the base container.
### Worker Base Container
Workers are expected to be controllable by the leader node using `ssh`. For workers, competitors have three responsibilities:
1. To report their status to the AWS infrastructure once per second, so that the infrastructure can determine whether work can be allocated to them.
2. To clean up when the leader completes a problem (or terminates with an error).
3. To assist with the solving process using ssh.
#### Reporting Status
To accomplish the first task, the infrastructure assumes that there is an executable at the location `/competition/worker` that will report status once per second. Status reporting is done by updating a file at location `/competition/worker_node_status.json`. The file contains a current status (one of {READY, BUSY, ERROR}), and a timestamp. The timestamp is the unix epoch time as returned by the linux time() function. If the worker fails to update this status file for more than 5 seconds, the worker node will be considered failed and the infrastructure will reset the node. Repeated failures of worker nodes (likely to be a maximum of three) will cause the system to score the problem as an 'error'.
Here is an example of the format for `worker_node_status.json`:
```text
{
"status": "READY", // one of {"READY", "BUSY", "ERROR"}
"timestamp": "1644545117" // linux epoch time as returned by the C time() function
}
```
The Mallob worker script is a 'bare-bones' example of status reporting that simply checks the status of the worker process and sshd. See the [worker](mallob-images/worker/worker) for details.
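A status updater in the spirit of the Mallob worker might look like the following sketch (the `is_busy()` check is a placeholder; the Mallob version inspects `ps` output for its `orted` and `sshd` processes):
```python
#!/usr/bin/env python3
import json
import time

def is_busy() -> bool:
    # Placeholder: check whether your solver's worker processes are running.
    return False

def update_worker_status(status_path="/competition/worker_node_status.json"):
    while True:
        status = "BUSY" if is_busy() else "READY"
        with open(status_path, "w") as f:
            json.dump({"status": status, "timestamp": int(time.time())}, f)
        time.sleep(1)  # the infrastructure treats >5 seconds without an update as failure

if __name__ == "__main__":
    update_worker_status()
```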
#### Cleaning up
You are also responsible for writing a worker `cleanup` script that is executed when a solving task is complete. The infrastructure will ensure that this file is called after the leader completes on a SAT/SMT problem, or after the leader fails in some way. We add this capability to make it straightforward to kill solver nodes that are still working on subproblems when the leader node completes. Given that the leader can also communicate with the worker using SSH, this file is not strictly necessary, depending on how the system is architected (i.e., if the leader can gracefully and quickly terminate all workers at the end of solving, and the leader itself does not fail). A minimal sketch of such a script appears after the Dockerfile example below.
In normal operation, this script is executed by the infrastructure when the leader process completes. It will also be executed when a failure occurs. The script must reset the worker state (the node itself will remain) so that it is ready to solve the next problem.
For example:
```text
FROM satcomp-base:worker
# Do all steps necessary to build your solver
RUN apt-get install my-amazing-solver
# Add your solver script
COPY worker /competition/worker
RUN chmod +x /competition/worker
COPY cleanup /competition/cleanup
RUN chmod +x /competition/cleanup
```
See the Mallob worker [Dockerfile](mallob-images/worker/Dockerfile) for more details.
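As mentioned above, the `cleanup` script itself can be quite small: terminate leftover solver processes and clear temporary files so the node is ready for the next problem. Here is a minimal sketch (the process name and scratch directory are assumptions):
```python
#!/usr/bin/env python3
# Sketch of a /competition/cleanup script.
import shutil
import subprocess

def cleanup():
    # Terminate any stray solver processes; pkill returns non-zero if none are found,
    # which we deliberately ignore.
    subprocess.run(["pkill", "-f", "my_solver"], check=False)
    # Remove temporary files left behind by the previous run (hypothetical location).
    shutil.rmtree("/tmp/my_solver_scratch", ignore_errors=True)

if __name__ == "__main__":
    cleanup()
```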
#### Assisting with the Solving Process
There are many ways to accomplish this goal, and to some degree this is the critical part of building a distributed solver. In past years, many competitors have used [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) to structure the communications between the leader and worker nodes (Mallob is one example that uses MPI), but competitors have the freedom to structure communication in whatever way makes the most sense for their solver. Worker nodes are not passed a representation of the SAT/SMT problem by the infrastructure: instead the leader must communicate the problem (or a portion of it) to the worker.
## Next step: Building / Running on AWS Infrastructure
After building your solvers, please test them using Docker, as we did with Mallob in this README. Once you have built your solver and you are confident that it works properly using Docker, then you are ready to try it out at larger scale on AWS infrastructure. Next, please navigate to [Infrastructure Readme](../infrastructure/README-Infrastructure.md), where the process of uploading, running, and debugging your docker images on AWS is described.
## FAQ / Troubleshooting
### Q: What is the build_mallob_images.sh script doing? What do the flags for docker build mean?
The script looks like this:
#!/bin/sh
cd common
docker build -t satcomp-mallob:common .
cd ..
cd leader
docker build -t satcomp-mallob:leader .
cd ..
cd worker
docker build -t satcomp-mallob:worker .
cd ..
First we factor out the common parts of the leader and worker into a common image. This Dockerfile builds the Mallob solver and its components that will be used by both the leader and the worker image. Then we use this common image to build the leader and worker images.
Two aspects of `docker build` are important for this script. First, the `-t satcomp-mallob:common` argument tells Docker to give the newly constructed image a _tag_ that names the image. Second, all file paths in a Dockerfile are resolved relative to the build context, i.e. the directory where the `docker build` command is run. This is why we navigate to the directory containing each Dockerfile before running the build.
Note that there is also a `nocache_build_mallob_images.sh` script. Although Docker does a pretty good job of dependency tracking, certain changes are invisible to it. For example, Docker can determine whether there has been an update to the Mallob repository, but if Mallob calls out to another repository, then Docker will not detect it, and you might get a stale build. To guard against this, there is a `--no-cache` flag that will do a full rebuild. If your solver has similar indirect dependencies, you might need to use the `--no-cache` flag.
#### To build the Mallob leader image:
1. Navigate to the `leader` subdirectory.
2. Run `docker build -t satcomp-mallob:leader .` The resulting image will be named `satcomp-mallob:leader`. This Dockerfile adds scripts necessary to use Mallob as the leader node in a distributed solver to the generated image.
#### To build the Mallob worker image:
1. Navigate to the `worker` subdirectory.
2. Run `docker build -t satcomp-mallob:worker .` This Dockerfile adds scripts necessary to use Mallob as the worker node in a distributed solver to the generated image.
### Q: Suppose I want to change the directory or network name for the run_parallel and run_cloud scripts. How do I do this?
A: The two variables to change are:
- `DOCKER_NETWORK`. Name of the docker bridge network. The default is `mallob-test`, which will work with the `network create` command in the previous section.
- `HOST_RUNDIR`. Name of the host directory that will be mounted in the docker run directory. _Note: This should be an absolute pathname and it must be updated for your filesystem._ This directory will be mounted in the docker container and enables you to persist information over multiple docker container sessions. We have placed an `experiment` directory inside `runner` that you can use to begin.
These are both documented in the script files.
### Q: After a while, I have lots of old versions of Docker images that are taking up disk space that I no longer need. How do I get rid of them?
A: On repeated image builds, previously-built images with the same name will be left with their name/tag set to `<none>/<none>`. Dangling Docker images can be deleted with `docker image prune`. A specific image can be deleted by running `docker rmi <IMAGE ID>`.
Note that you can delete all docker images on your machine by running `docker rmi -f $(docker images -a -q)`. Be careful: only do this after running `docker images` to check that you won't delete images unrelated to the solver competition.
### Q: I'm only submitting to the parallel track and not the cloud track. Do I need a worker image?
A: No. If you are submitting to the parallel track only, you do not need a worker image. For the parallel track, we will assume that the leader manages all threading and communications within the single (multi-core) compute node.

View File

@ -0,0 +1,12 @@
#!/bin/sh
cd common
docker build -t satcomp-mallob:common .
cd ..
cd leader
docker build -t satcomp-mallob:leader .
cd ..
cd worker
docker build -t satcomp-mallob:worker .
cd ..

View File

@ -0,0 +1,13 @@
################### Build Mallob
FROM satcomp-infrastructure:common
USER root
# Install required software
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt install -y cmake build-essential zlib1g-dev libopenmpi-dev wget unzip python3 gfortran curl
# Build Mallob
RUN git clone https://github.com/domschrei/mallob && cd mallob && git checkout interface && git checkout 35794bfa9d650f7c465c08cd0073a71833119162 && cd ..
WORKDIR mallob
RUN cd lib && bash fetch_and_build_sat_solvers.sh klcy && cd ..
RUN mkdir build
RUN cd build && cmake -DCMAKE_BUILD_TYPE=RELEASE -DMALLOB_USE_RESTRICTED=1 .. && VERBOSE=1 make && cd ..

View File

@ -0,0 +1,20 @@
################### Use Mallob
FROM satcomp-mallob:common AS builder
USER root
################### Extract Mallob in run stage
FROM satcomp-infrastructure:leader AS mallob_liaison
WORKDIR /
# Copy mallob and solver scripts
COPY --from=builder /mallob/build/mallob mallob
COPY --from=builder /mallob/build/mallob_sat_process mallob_sat_process
COPY --from=builder /mallob/build/mallob_process_dispatcher mallob_process_dispatcher
COPY --chown=ecs-user /init_solver.sh /competition
COPY --chown=ecs-user /run_solver.sh /competition
COPY --chown=ecs-user /solver /competition
USER ecs-user
RUN chmod +x /competition/init_solver.sh
RUN chmod +x /competition/run_solver.sh
RUN chmod +x /competition/solver

View File

@ -0,0 +1,7 @@
#!/bin/bash
# start sshd
/usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config &
# run solver
/competition/solver /rundir

View File

@ -0,0 +1,10 @@
#!/bin/bash
export OMPI_MCA_btl_vader_single_copy_mechanism=none
export RDMAV_FORK_SAFE=1
# $1: MPI hostfile, $2: formula file (both supplied by /competition/solver)
mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --hostfile $1 --bind-to none mallob -mono=$2 \
-zero-only-logging -sleep=1000 -t=4 -appmode=fork -nolog -v=3 -interface-fs=0 -trace-dir=competition \
-pipe-large-solutions=0 -processes-per-host=$processes_per_host -regular-process-allocation \
-max-lits-per-thread=50000000 -strict-clause-length-limit=20 -clause-filter-clear-interval=500 \
-max-lbd-partition-size=2 -export-chunks=20 -clause-buffer-discount=1.0 -satsolver=k

View File

@ -0,0 +1,115 @@
#!/usr/bin/env python3
import json
import logging
import os
import subprocess
import sys
import threading
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
class Runner:
def __init__(self, request_directory: str):
self.logger = logging.getLogger("Runner")
self.logger.setLevel(logging.INFO)
self.request_directory = request_directory
os.environ['PYTHONUNBUFFERED'] = "1"
def process_stream(self, stream, str_name, file_handle):
line = stream.readline()
while line != "":
self.logger.info(f"{str_name}: {line}")
file_handle.write(line)
line = stream.readline()
def run(self, cmd: list):
self.logger.info("Running command: %s", str(cmd))
stdout_target_loc = os.path.join(self.request_directory, "stdout.log")
stderr_target_loc = os.path.join(self.request_directory, "stderr.log")
with open(stdout_target_loc, "w") as stdout_handle:
with open(stderr_target_loc, "w") as stderr_handle:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True, start_new_session=True)
stdout_t = threading.Thread(target = self.process_stream, args=(proc.stdout, "STDOUT", stdout_handle))
stderr_t = threading.Thread(target = self.process_stream, args=(proc.stderr, "STDERR", stderr_handle))
stdout_t.start()
stderr_t.start()
return_code = proc.wait()
stdout_t.join()
stderr_t.join()
return {
"stdout": stdout_target_loc,
"stderr": stderr_target_loc,
"return_code": return_code,
"output_directory": self.request_directory
}
def get_input_json(self):
input = os.path.join(self.request_directory, "input.json")
with open(input) as f:
return json.loads(f.read())
def create_hostfile(self,ips, request_dir):
hostfile_path = os.path.join(request_dir, 'combined_hostfile')
with open(hostfile_path, 'w+') as f:
for ip in ips:
f.write(f'{ip} slots=4') # distributed default
#f.write(f'{ip} slots=16') # parallel default
f.write('\n')
return hostfile_path
def get_command(self, input_json):
formula_file = input_json.get("formula_file")
worker_node_ips = input_json.get("worker_node_ips", [])
combined_hostfile = self.create_hostfile(worker_node_ips, self.request_directory)
run_list = ["/competition/run_solver.sh"]
run_list.append(combined_hostfile)
run_list.append(formula_file)
return run_list
class MallobParser:
@staticmethod
def get_result(output_file):
"""
TODO: Participants should replace this with something more robust for their own solver!
"""
with open(output_file) as f:
raw_logs = f.read()
if "found result UNSAT" in raw_logs:
return 20
elif "found result SAT" in raw_logs:
return 10
elif "UNKNOWN" in raw_logs:
return 0
else:
return -1
if __name__ == "__main__":
request_directory = sys.argv[1]
runner = Runner(request_directory)
input_json = runner.get_input_json()
cmd = runner.get_command(input_json)
print (f"cmd: {cmd}")
output = runner.run(cmd)
result = MallobParser.get_result(output["stdout"])
logging.info(f"RESULT: {result}")
solver_output = {
"return_code": result,
"artifacts": {
"stdout_path": output["stdout"],
"stderr_path": output["stderr"]
}
}
with open(os.path.join(request_directory, "solver_out.json"), "w+") as f:
f.write(json.dumps(solver_output))

View File

@ -0,0 +1,12 @@
#!/bin/sh
cd common
docker build --no-cache -t satcomp-mallob:common .
cd ..
cd leader
docker build --no-cache -t satcomp-mallob:leader .
cd ..
cd worker
docker build --no-cache -t satcomp-mallob:worker .
cd ..

View File

@ -0,0 +1,17 @@
################### Use Mallob
FROM satcomp-mallob:common AS builder
USER root
################### Extract Mallob in run stage
FROM satcomp-infrastructure:worker AS mallob_liaison
WORKDIR /
# Copy mallob
COPY --from=builder /mallob/build/mallob mallob
COPY --from=builder /mallob/build/mallob_sat_process mallob_sat_process
COPY --from=builder /mallob/build/mallob_process_dispatcher mallob_process_dispatcher
COPY --chown=ecs-user /init_solver.sh /competition
COPY --chown=ecs-user /worker /competition
USER ecs-user
RUN chmod +x /competition/init_solver.sh
RUN chmod +x /competition/worker

View File

@ -0,0 +1,5 @@
#!/bin/bash
# start sshd
/usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config &

View File

@ -0,0 +1,37 @@
#!/usr/bin/env python3
import json
import subprocess
import time
import logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
def check_process_status():
status = 'READY'
output = subprocess.check_output(['ps', 'ax', '-ocomm']).decode("utf-8").strip()
if 'orted' in output:
status = 'BUSY'
if 'sshd' not in output:
status = 'ERROR'
return status
def update_worker_status():
while True:
status = check_process_status()
data = {"status": status, "timestamp": int(time.time())}
with open(
"/competition/worker_node_status.json",
"w+",
) as statusfile:
statusfile.write(json.dumps(data))
time.sleep(1)
if __name__ == "__main__":
update_worker_status()

View File

@ -0,0 +1,2 @@
leader slots=4
worker slots=4

View File

@ -0,0 +1,4 @@
c
p cnf 3 2
1 -3 0
2 3 -1 0

View File

@ -0,0 +1,48 @@
#!/bin/sh
# run_dist_leader.sh: Use this script to launch Docker container
# for the distributed Mallob leader. Arguments:
# $1: docker image name (assumes a "leader" tag)
# $2: formula file (relative to HOST_RUNDIR)
# check number arguments
if [[ $# -lt 2 ]]; then
echo "run_parallel.sh: needs two arguments: <docker_image_name> <formula file>"
exit 1
fi
# user config
DOCKER_NETWORK="mallob-test"
HOST_RUNDIR="/home/rbtjones/dev/sat23/docker/runner/experiment"
# script config
NODE_TYPE="leader"
DOCKER_RUNDIR="/rundir"
# summary
echo "run_dist_worker.sh, running with"
echo " node type: $NODE_TYPE"
echo " docker network: $DOCKER_NETWORK"
echo " docker image: $1:$NODE_TYPE"
echo " formula file: $HOST_RUNDIR/$2"
echo " host rundir: $HOST_RUNDIR"
echo " docker rundir: $DOCKER_RUNDIR"
# does host directory exist?
if [[ ! -d "$HOST_RUNDIR" ]]; then
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
exit 1
fi
# does formula file exist?
if [[ ! -f "$HOST_RUNDIR/$2" ]]; then
echo "ERROR - unable to read formula file '$HOST_RUNDIR/$2'"
exit 1
fi
# create input.json
echo -e "{\n \"formula_file\": \"$DOCKER_RUNDIR/$2\",\n \"worker_node_ips\": [\"leader\", \"worker\"],\n \"timeout_seconds\": \"1000\",\n \"formula_language\": \"smt2\",\n \"solver_argument_list\": []\n}" > "$HOST_RUNDIR/input.json"
# Run docker image. See comments in run_parallel.sh
#
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:/$DOCKER_RUNDIR -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"

View File

@ -0,0 +1,33 @@
#!/bin/sh
# run_dist_worker.sh: Use this script to launch the Docker container
# for the distributed Mallob worker. Arguments:
# $1: docker image name (assumes a "worker" tag)
# check number arguments
if [[ $# -lt 1 ]]; then
echo "configure_parallel.sh: needs one argument: <docker_image_name>"
exit 1
fi
# user config
DOCKER_NETWORK="mallob-test"
HOST_RUNDIR="/home/rbtjones/dev/sat23/docker/runner/experiment"
# config to match other scripts
NODE_TYPE="worker"
DOCKER_RUNDIR="/rundir"
# summary
echo "run_dist_worker.sh, running with"
echo " node type: $NODE_TYPE"
echo " docker network: $DOCKER_NETWORK"
echo " docker image: $1:$NODE_TYPE"
echo " host rundir: $HOST_RUNDIR"
echo " docker rundir: $DOCKER_RUNDIR"
# does host directory exist?
if [[ ! -d "$HOST_RUNDIR" ]]; then
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
exit 1
fi
# Run docker image. See comments in run_parallel.sh
#
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:/$DOCKER_RUNDIR -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"

View File

@ -0,0 +1,60 @@
#!/bin/bash
# run_parallel.sh: Use this script to launch Docker container
# for parallel Mallob. Arguments:
# $1: docker image name (assumes a "leader" tag)
# $2: formula file (relative to HOST_RUNDIR)
# check number arguments
if [[ $# -lt 2 ]]; then
echo "run_parallel.sh: needs two arguments: <docker_image_name> <formula file>"
exit 1
fi
# user config
DOCKER_NETWORK="mallob-test"
HOST_RUNDIR="/home/rbtjones/dev/sat23/docker/runner/experiment"
# config to match other scripts
NODE_TYPE="leader"
DOCKER_RUNDIR="/rundir"
# summary
echo "run_parallel.sh, running with"
echo " node type: $NODE_TYPE"
echo " docker network: $DOCKER_NETWORK"
echo " docker image: $1:$NODE_TYPE"
echo " formula file: $HOST_RUNDIR/$2"
echo " host rundir: $HOST_RUNDIR"
echo " docker rundir: $DOCKER_RUNDIR"
# does host directory exist?
if [[ ! -d "$HOST_RUNDIR" ]]; then
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
exit 1
fi
# does formula file exist?
if [[ ! -f "$HOST_RUNDIR/$2" ]]; then
echo "ERROR - unable to read formula file '$HOST_RUNDIR/$2'"
exit 1
fi
#
# create input.json
#
echo -e "{\n\"formula_file\": \"$DOCKER_RUNDIR/$2\",\n\"worker_node_ips\": [\"leader\"]\n}" > "$HOST_RUNDIR/input.json"
#
# Run docker image as container. Arguments:
# -i // keep STDIN open even if not attached
# --shm-size=XXg // setting up shared memory (for clause DB)
# // default (1G?) is too small
# --network mallob-test // use the Docker bridge network called 'mallob-test'
# --entrypoint bash // when the container starts, it runs bash
# --rm // remove the docker container when exiting
# -v // mount $HOST_RUNDIR in the host filesystem at '/$DOCKER_RUNDIR' in docker image
# -t // Docker image name for leader (script hard-codes 'leader' tag)
# -c <bash_command> // gets passed to bash shell
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:/$DOCKER_RUNDIR -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"

View File

@ -0,0 +1,7 @@
{
"formula_file": "/rundir/experiment/test.cnf",
"worker_node_ips": ["leader", "worker"],
"timeout_seconds": "1000",
"formula_language": "",
"solver_argument_list": [""]
}

View File

@ -0,0 +1,63 @@
#!/bin/zsh
# run_dist_leader.sh: Use this script to launch Docker container
# for the distributed Mallob leader. Arguments:
# $1: docker image name (assumes a "leader" tag)
# $2: formula file (relative to HOST_RUNDIR)
# $*: solver arguments (passed directly to solver)
# check number arguments
if [[ $# -lt 2 ]]; then
echo "run_dist_leader.sh usage: <docker_image_name> <formula_file> [solver arguments]"
exit 1
fi
# read first two command-line arguments
DOCKER_IMAGE_NAME=$1
FORMULA_FILENAME=$2
shift 2
SOLVER_ARGS_SPACED=$*
SOLVER_ARGS="\"${SOLVER_ARGS_SPACED// /\", \"}\""
# default config; user can replace DOCKER_NETWORK / HOST_RUNDIR if desired.
DOCKER_NETWORK="mallob-test"
SCRIPTPATH=$(readlink -f "$0")
SCRIPTDIR=$(dirname "$SCRIPTPATH")
HOST_RUNDIR="$SCRIPTDIR"
# script config
NODE_TYPE="leader"
DOCKER_RUNDIR="/rundir"
DOCKER_IMAGE="$DOCKER_IMAGE_NAME:$NODE_TYPE"
HOST_FORMULA_FILE="$HOST_RUNDIR/$FORMULA_FILENAME"
DOCKER_FORMULA_FILE="$DOCKER_RUNDIR/$FORMULA_FILENAME"
# summary
echo "run_dist_worker.sh, running with:"
echo " node type: $NODE_TYPE"
echo " docker network: $DOCKER_NETWORK"
echo " docker image: $DOCKER_IMAGE"
echo " formula file: $FORMULA_FILENAME"
echo " host rundir: $HOST_RUNDIR"
echo " docker rundir: $DOCKER_RUNDIR"
# does host directory exist?
if [[ ! -d "$HOST_RUNDIR" ]]; then
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
exit 1
fi
# does formula file exist?
if [[ ! -f "$HOST_FORMULA_FILE" ]]; then
echo "ERROR - unable to read formula file '$HOST_FORMULA_FILE'"
exit 1
fi
# create input.json, all extra command-line args get hoovered as solver args
echo -e "{\n \"formula_file\": \"$DOCKER_FORMULA_FILE\",\n \"worker_node_ips\": [\"leader\", \"worker\"],\n \"timeout_seconds\": \"1000\",\n \"formula_language\": \"\",\n \"solver_argument_list\": [$SOLVER_ARGS]\n}" > "$HOST_RUNDIR/input.json"
# Run docker image. See comments in run_parallel.sh
#
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:$DOCKER_RUNDIR -t $DOCKER_IMAGE -c "/competition/init_solver.sh; exec bash"

View File

@ -0,0 +1,24 @@
#!/bin/zsh
# check number arguments
if [[ $# -lt 1 ]]; then
echo "configure_parallel.sh: needs one argument: <docker_image_name>"
exit 1
fi
# default config; user can replace DOCKER_NETWORK if desired.
DOCKER_NETWORK="mallob-test"
# config to match other scripts
NODE_TYPE="worker"
# summary
echo "run_dist_worker.sh, running with"
echo " node type: $NODE_TYPE"
echo " docker network: $DOCKER_NETWORK"
echo " docker image: $1:$NODE_TYPE"
# Run docker image. See comments in run_parallel.sh
#
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -t $1:$NODE_TYPE -c "/competition/init_solver.sh; exec bash"

View File

@ -0,0 +1,74 @@
#!/bin/bash
# run_parallel.sh: Use this script to launch Docker container
# for parallel Mallob. Arguments:
# $1: docker image name (assumes a "leader" tag)
# $2: formula file (relative to HOST_RUNDIR)
# $*: solver arguments (passed directly to solver)
# check number arguments
if [[ $# -lt 2 ]]; then
echo "run_parallel.sh usage: <docker_image_name> <formula_file> [solver arguments]"
exit 1
fi
# read first two command-line arguments
DOCKER_IMAGE_NAME=$1
FORMULA_FILENAME=$2
shift 2
SOLVER_ARGS_SPACED=$*
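# convert the space-separated solver args into a quoted, comma-separated list for the JSON solver_argument_list field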
SOLVER_ARGS="\"${SOLVER_ARGS_SPACED// /\", \"}\""
# default config; user can replace DOCKER_NETWORK / HOST_RUNDIR if desired.
DOCKER_NETWORK="mallob-test"
SCRIPTPATH=$(readlink -f "$0")
SCRIPTDIR=$(dirname "$SCRIPTPATH")
HOST_RUNDIR="$SCRIPTDIR"
# config to match other scripts
NODE_TYPE="leader"
DOCKER_RUNDIR="/rundir"
DOCKER_IMAGE="$DOCKER_IMAGE_NAME:$NODE_TYPE"
HOST_FORMULA_FILE="$HOST_RUNDIR/$FORMULA_FILENAME"
DOCKER_FORMULA_FILE="$DOCKER_RUNDIR/$FORMULA_FILENAME"
# summary
echo "run_parallel.sh, running with"
echo " node type: $NODE_TYPE"
echo " docker network: $DOCKER_NETWORK"
echo " docker image: $DOCKER_IMAGE"
echo " formula file: $FORMULA_FILENAME"
echo " host rundir: $HOST_RUNDIR"
echo " docker rundir: $DOCKER_RUNDIR"
# does host directory exist?
if [[ ! -d "$HOST_RUNDIR" ]]; then
echo "ERROR - unable to reach host run directory '$HOST_RUNDIR'"
exit 1
fi
# does formula file exist?
if [[ ! -f "$HOST_FORMULA_FILE" ]]; then
echo "ERROR - unable to read formula file '$HOST_FORMULA_FILE'"
exit 1
fi
#
# create input.json
#
echo -e "{\n \"formula_file\": \"$DOCKER_FORMULA_FILE\",\n \"worker_node_ips\": [\"leader\"],\n \"timeout_seconds\": \"1000\",\n \"formula_language\": \"\",\n \"solver_argument_list\": [$SOLVER_ARGS]\n}" > "$HOST_RUNDIR/input.json"
#
# Run docker image as container. Arguments:
# -i // keep STDIN open even if not attached
# --shm-size=XXg // setting up shared memory (for clause DB)
# // default (1G?) is too small
# --network mallob-test // use the Docker bridge network called 'mallob-test'
# --entrypoint bash // when the container starts, it runs bash
# --rm // remove the docker container when exiting
# -v // mount $HOST_RUNDIR from the host filesystem at '$DOCKER_RUNDIR' inside the container
# -t // allocate a pseudo-TTY; the image name (with the hard-coded 'leader' tag) follows
# -c <bash_command> // gets passed to bash shell
docker run -i --shm-size=32g --name $NODE_TYPE --network $DOCKER_NETWORK --entrypoint bash --rm -v $HOST_RUNDIR:$DOCKER_RUNDIR -t $DOCKER_IMAGE -c "/competition/init_solver.sh; exec bash"

View File

@ -0,0 +1 @@
{"return_code": 10, "artifacts": {"stdout_path": "/rundir/stdout.log", "stderr_path": "/rundir/stderr.log"}}

View File

@ -0,0 +1 @@
Warning: Permanently added 'worker,172.18.0.2' (ECDSA) to the list of known hosts.

View File

@ -0,0 +1,69 @@
c
c Mallob -- a parallel and distributed platform for job scheduling, load balancing, and SAT solving
c Designed by P. Sanders and D. Schreiber 2018-2022
c Developed by D. Schreiber 2019-2022
c
c 0.000 0 Program options: -0o=1 -J=1 -T=0 -ajpc=0 -ans=0 -appmode=fork -ba=4 -bicg=4 -c=1 -cbbs=1500 -cbdf=1 -cfci=500 -cg=1 -ch=0 -chaf=5 -checksums=0 -chstms=10 -cmp=0 -colors=0 -dc=0 -ddd=0 -delaymonkey=0 -derandomize=1 -evu=0 -fapii=0 -g=0 -gclls=0 -h=0 -hbbfs=10 -hubfs=9999999 -huca=-1 -interface-fs=0 -interface-ipc=0 -isp=0 -jc=4 -jcl=0 -jcup=0 -jjp=0 -jwl=0 -l=1 -latencymonkey=0 -ljpc=32 -mbfsd=4 -mbt=1000000 -mcips=10 -md=0 -mid=0 -mjps=0 -mlbdps=2 -mlpt=50000000 -mmpi=0 -mono=/rundir/experiment/test.cnf -nce=20 -os=0 -p=0.01 -phasediv=1 -pls=0 -pph=0 -q=0 -qcll=8 -qlbdl=2 -ril=0 -rpa=1 -rs=1 -rto=0 -s=1 -satsolver=k -scll=20 -seed=0 -sjd=0 -slbdl=30 -sleep=1000 -t=4 -trace-dir=competition -v=3 -w=-1 -wam=10000 -warmup=0 -watchdog=1 -wr=0 -y=1 -yield=0
c 0.000 0 Mallob rls pid=18 on host d6efa65e5077
c 0.000 0 8 workers, 1 clients
c 0.126 0 Created client communicator
c 0.225 0 Created worker communicator
c 0.225 0 My bounce alternatives: 5 7 6 2
c 0.225 0 Set up API at src/interface/api/api_connector.hpp
c 0.225 0 <Janitor> tid=79
c 0.235 0 <Reader> Starting
c 0.297 0 Machine color 0 with 4 total workers (my rank: 0)
c 0.297 0 I Mapping job "admin.mono-job.json" to internal ID #1
c 0.297 0 <Reader> [T] Reading job #1 rev. 0 {/rundir/experiment/test.cnf} ...
c 0.297 0 <Reader> [T] Initialized job #1 {/rundir/experiment/test.cnf} in 0.000s: 7 lits w/ separators, 0 assumptions
c 0.298 0 Introducing job #1 rev. 0 : r.#1:0 rev. 0 <- [0] born=0.298 hops=0 epoch=-1 => [1]
c 0.301 0 Scheduling r.#1:0 rev. 0 <- [0] born=0.298 hops=1 epoch=-1 on [3] (latency: 0.00327s)
c 0.312 0 ADOPT r.#1:2 rev. 0 <- [3] born=0.336 hops=0 epoch=2 mode=1 <= [3]
c 0.312 0 COMMIT #1 -> #1:2
c 0.313 0 #1:2 growing: r.#1:5 rev. 0 <- [0] born=0.313 hops=0 epoch=2 => [6]
c 0.313 0 #1:2 growing: r.#1:6 rev. 0 <- [0] born=0.313 hops=0 epoch=2 => [4]
c 0.314 0 UNCOMMIT #1:2
c 0.314 0 LOAD 1 (+#1:2)
c 0.314 0 EXECUTE #1:2 <= [3]
c 0.326 0 <#1> Mallob SAT engine rls pid=94
c 0.326 0 <#1> S8.0 Diversifying 8
c 0.326 0 <#1> S8.0 found result SAT for rev. 0
c 0.326 0 <#1> S9.0 Diversifying 9
c 0.326 0 <#1> S9.0 found result SAT for rev. 0
c 0.326 0 <#1> S10.0 Diversifying 10
c 0.327 0 <#1> S11.0 Diversifying 11
c 0.327 0 <#1> S11.0 found result SAT for rev. 0
c 0.327 0 <#1> S10.0 found result SAT for rev. 0
c 0.337 0 LOAD 0 (-#1:2)
c 0.337 0 #1:2 desires fulfilled=0.0000 latency={num:0 min:0.00000 med:0.00000 avg:0.00000 max:0.00000}
c 0.337 0 <#1> END S8 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
c 0.337 0 <#1> END S8 clenhist prod total:0 0
c 0.337 0 <#1> END S8 clenhist digd total:0 0
c 0.337 0 <#1> END S9 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
c 0.337 0 <#1> END S9 clenhist prod total:0 0
c 0.337 0 <#1> END S9 clenhist digd total:0 0
c 0.337 0 <#1> END S10 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
c 0.337 0 <#1> END S10 clenhist prod total:0 0
c 0.337 0 <#1> END S10 clenhist digd total:0 0
c 0.337 0 <#1> END S11 pps:3 dcs:2 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
c 0.337 0 <#1> END S11 clenhist prod total:0 0
c 0.337 0 <#1> END S11 clenhist digd total:0 0
c 0.337 0 <#1> END pps:12 dcs:8 cfs:0 rst:0 prod:0 (flt:0 adm:0 drp:0) recv:0 (flt:0 digd:0 drp:0) + intim:0/0
c 0.337 0 <#1> END exp:0/0 drp:0(0.000000) pflt:0 sflt:0
c 0.337 0 <#1> clenhist prod total:0 0
c 0.337 0 <#1> clenhist flfl total:0 0
c 0.337 0 <#1> clenhist admt total:0 0
c 0.337 0 <#1> clenhist drpd total:0 0
c 0.337 0 <#1> clenhist dltd total:0 0
c 0.337 0 <#1> clenhist retd total:0 0
c 0.348 0 RESPONSE_TIME #1 0.049750 rev. 0
c 0.348 0 SOLUTION #1 SAT rev. 0
s SATISFIABLE
v 1 2 3 0
c 0.349 0 Received exit signal <= [0]
c 0.349 0 Terminating.
c 0.444 0 busytime=0.023
c 0.544 0 STATS treegrowth_latencies num:0 min:0.000000 max:0.000000 med:0.000000 mean:0.000000
c 0.545 0 STATS balancing_latencies num:0 min:0.000000 max:0.000000 med:0.000000 mean:0.000000
c 0.545 0 <Reader> Stopping
c 0.794 0 Exiting happily

View File

@ -0,0 +1,4 @@
#!/bin/sh
docker build -f satcomp-common/Dockerfile -t satcomp-infrastructure:common .
docker build -f satcomp-leader/Dockerfile -t satcomp-infrastructure:leader .
docker build -f satcomp-worker/Dockerfile -t satcomp-infrastructure:worker .

View File

@ -0,0 +1,14 @@
FROM ubuntu:20.04
RUN useradd -ms /bin/bash ecs-user
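# Install SSH, OpenMPI and Python tooling, then set up passwordless SSH for ecs-user so the leader container can reach workers without host-key prompts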
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip git openssh-server iproute2 openmpi-bin openmpi-common iputils-ping \
&& mkdir /var/run/sshd \
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd \
&& setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd \
&& chown -R ecs-user /etc/ssh/ \
&& su - ecs-user -c \
'ssh-keygen -q -t rsa -f ~/.ssh/id_rsa -N "" \
&& cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys \
&& cp /etc/ssh/sshd_config ~/.ssh/sshd_config \
&& sed -i "s/UsePrivilegeSeparation yes/UsePrivilegeSeparation no/g" ~/.ssh/sshd_config \
&& printf "Host *\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config'

View File

@ -0,0 +1,14 @@
FROM satcomp-infrastructure:common
RUN pip3 install flask waitress boto3 pytz polling2
COPY --chown=ecs-user satcomp-solver-resources/ /opt/amazon/
COPY --chown=ecs-user satcomp-leader/resources/solver /competition/solver
COPY --chown=ecs-user satcomp-leader/resources/leader /competition/leader
ENV PYTHONPATH=/opt/amazon:.:
RUN chmod u+x /competition/solver
RUN chmod u+x /competition/leader
RUN ls /opt/amazon/arg_satcomp_solver_base
RUN chmod u+x /opt/amazon/arg_satcomp_solver_base/leader_entrypoint.py
RUN service ssh start
USER ecs-user
EXPOSE 22
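# Start sshd (using the per-user config generated in the common image) in the background, then run the leader entrypoint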
ENTRYPOINT /usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config & /opt/amazon/arg_satcomp_solver_base/leader_entrypoint.py

View File

@ -0,0 +1,18 @@
#!/usr/bin/env python3
""" This is a test script that saves leader status to json file every second"""
import json
import time
def update_leader_status():
while True:
data = {"status": "READY", "timestamp": int(time.time())}
with open(
"/competition/leader_node_status.json",
"w",
) as statusfile:
statusfile.write(json.dumps(data))
time.sleep(1)
if __name__ == "__main__":
update_leader_status()

View File

@ -0,0 +1,14 @@
#!/bin/bash
echo "This is a test! Here is what the log would look like"
echo " \
{
\"return_code\": 0,
\"artifacts\": {
\"stdout_path\": \"$1/solver_stdout.log\",
\"stderr_path\": \"$1/solver_stderr.log\"
}
}
" > $1/solver_out.json
echo "This is the stdout log" >> $1/solver_stdout.log
echo "This is the stderr log" >> $1/solver_stderr.log

View File

@ -0,0 +1,51 @@
"""
This module runs in the background of the Leader Base Container.
It updates the manifest periodically.
"""
import json
import logging
import os
import socket
import threading
import uuid
import time
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest, NodeType
from arg_satcomp_solver_base.utils import FileOperations
class LeaderTimeoutException(Exception):
pass
class LeaderStatusChecker(threading.Thread):
"""Thread that runs in the background publishing leader READY status & IP address by
updating manifest every second"""
leader_poll_sleep_time = 1
file_operations: FileOperations = FileOperations()
def __init__(self, node_manifest: DynamodbManifest):
threading.Thread.__init__(self)
self.node_manifest = node_manifest
self.logger = logging.getLogger("LeaderPoller")
self.logger.setLevel(logging.DEBUG)
self.daemon = True
def update_manifest(self, uuid: str, local_ip_address: str, status: str):
self.logger.info(f"Giving node heartbeat for node {uuid} with ip {local_ip_address}")
self.node_manifest.register_node(str(uuid), local_ip_address, status, NodeType.LEADER.name)
# TODO: This should be updated so that the leader emits its status (just like the workers)
# Perhaps for SAT-Comp 2023
def run(self):
local_ip_address = socket.gethostbyname(socket.gethostname())
node_uuid = uuid.uuid4()
self.logger.info(f"Registering leader node {node_uuid} with ip {local_ip_address}")
while True:
current_timestamp = int(time.time())
self.update_manifest(node_uuid, local_ip_address, "READY")
self.logger.info("Leader updated status")
time.sleep(self.leader_poll_sleep_time)

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""This file is the main entrypoint for the Leader Node Base Container"""
import logging
import os
import socket
import subprocess
import sys
from time import sleep
sys.path.append('/opt/amazon/lib/python3.8/site-packages/')
from arg_satcomp_solver_base.solver.command_line_solver import CommandLineSolver
from arg_satcomp_solver_base.sqs_queue.sqs_queue import SqsQueue
from arg_satcomp_solver_base.poller.poller import Poller
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest
from arg_satcomp_solver_base.task_end_notification.task_end_notifier import TaskEndNotifier
from arg_satcomp_solver_base.leader.leader import LeaderStatusChecker
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
QUEUE_NAME = os.getenv('SQS_QUEUE_NAME')
OUTPUT_QUEUE_NAME = os.getenv('SQS_OUTPUT_QUEUE_NAME')
SATCOMP_BUCKET_NAME = os.getenv('SATCOMP_BUCKET_NAME')
def run_leader(path):
cmd = os.path.join(path, "leader")
stdout = os.path.join(path, "leader_stdout.log")
stderr = os.path.join(path, "leader_stderr.log")
with open(stdout, "wb") as out, open(stderr, "wb") as err:
subprocess.Popen(cmd, stdout=out, stderr=err)
if __name__ == "__main__":
logger = logging.getLogger("Leader Entrypoint")
logger.setLevel(logging.DEBUG)
dir = "/competition"
logger.info("Running leader healthcheck")
run_leader(dir) # Run script to save status to leader_node_status.json
sleep(1) # Wait for creation of leader_node_status.json file
logger.info("Getting input queue: %s", QUEUE_NAME)
sqs_input_queue = SqsQueue.get_sqs_queue(QUEUE_NAME)
logger.info("Getting output queue: %s", QUEUE_NAME)
sqs_output_queue = SqsQueue.get_sqs_queue(OUTPUT_QUEUE_NAME)
logger.info(f"Bucket name: {SATCOMP_BUCKET_NAME}")
logger.info("Getting task end notifier")
task_end_notifier = TaskEndNotifier.get_task_end_notifier()
logger.info("Getting node manifest")
node_manifest = DynamodbManifest.get_dynamodb_manifest()
logger.info("Building command line solver to run solver script /competition/solver")
solver = CommandLineSolver(f"{dir}/solver")
logger.info("Getting local IP address")
local_ip_address = socket.gethostbyname(socket.gethostname())
logger.info("Starting leader status checker")
leader_status = LeaderStatusChecker(node_manifest)
leader_status.start()
logger.info("starting poller")
poller = Poller(1, local_ip_address, sqs_input_queue, sqs_output_queue, node_manifest, task_end_notifier, solver, SATCOMP_BUCKET_NAME)
poller.start()
poller.join()

View File

@ -0,0 +1,97 @@
import logging
import time
from enum import Enum
import boto3
class NodeStatus(Enum):
READY = "READY"
BUSY = "BUSY"
FAILED = "FAILED"
class NodeType(Enum):
WORKER = "WORKER"
LEADER = "LEADER"
class DynamodbManifest:
"""
Node manifest for storing IP and status of satcomp nodes based on a DynamoDB Table
"""
def __init__(self, table, node_expiration_time=120):
self.table = table
self.node_expiration_time = node_expiration_time
self.logger = logging.getLogger(__name__)
def register_node(self, node_id: str, ip_address: str, status: str, node_type: str):
"""
Register node with node manifest table
:param node_id: Unique ID for this node
:param ip_address: IP address for this node
:param status: Current status of Node
:param node_type: Node type, either WORKER or LEADER
:return:
"""
current_time = int(time.time())
self.table.update_item(
Key={
'nodeId': node_id
},
UpdateExpression="set nodeIp = :ipVal, #stskey = :stsVal, nodeType = :nodeTypeVal,"
" lastModified = :lastModifiedVal",
ExpressionAttributeNames={
"#stskey": "status"
},
ExpressionAttributeValues={
":ipVal": ip_address,
":stsVal": status,
":nodeTypeVal": node_type,
":lastModifiedVal": current_time
}
)
def get_all_ready_nodes(self, nodeTypeVal):
"""
Returns all nodes of type nodeTypeVal that are currently set to READY status
:return:
"""
expiration_time = int(time.time() - self.node_expiration_time)
nodes = self.table.scan(
Select="ALL_ATTRIBUTES",
FilterExpression="#nodeStatusKey = :nodeStatusVal and #nodeTypeKey = :nodeTypeVal and "
"#lastModifiedKey > :lastModifiedVal",
ExpressionAttributeNames={
"#nodeStatusKey": "status",
"#nodeTypeKey": "nodeType",
"#lastModifiedKey": "lastModified"
},
ExpressionAttributeValues={
":nodeStatusVal": "READY",
":nodeTypeVal": nodeTypeVal,
":lastModifiedVal": expiration_time
},
ConsistentRead=True
)
return nodes["Items"]
def get_all_ready_worker_nodes(self):
"""
Returns all worker nodes that are currently set to READY status
:return:
"""
return self.get_all_ready_nodes("WORKER")
def get_all_ready_leader_nodes(self):
"""
Returns a list of zero or one leader node currently set to READY status
:return:
"""
return self.get_all_ready_nodes("LEADER")
@staticmethod
def get_dynamodb_manifest():
dynamodb_resource = boto3.resource("dynamodb")
table = dynamodb_resource.Table("SatCompNodeManifest")
return DynamodbManifest(table)

View File

@ -0,0 +1,214 @@
"""
This module is a Poller which will run in the background of the satcomp Base Container.
It will poll an SQS queue looking for problems to solve and send them to solver implementations.
"""
import json
import logging
import threading
from enum import Enum
from json import JSONDecodeError
from time import sleep
import os
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest
from arg_satcomp_solver_base.s3_file_system.s3_file_system import S3FileSystem, S3FileSystemException
from arg_satcomp_solver_base.solver.command_line_solver import Solver, SolverException
from arg_satcomp_solver_base.sqs_queue.sqs_queue import SqsQueue, SqsQueueException
from arg_satcomp_solver_base.task_end_notification.task_end_notifier import TaskEndNotifier
from arg_satcomp_solver_base.utils import FileOperations
MOUNT_POINT = '/tmp'
class PollerStatus(Enum):
HEALTHY = "HEALTHY"
class PollerTimeoutException(Exception):
pass
class Poller(threading.Thread):
"""Thread that runs in the background trying to pull work off of an SQS
queue and submitting that work to a solver"""
worker_poll_timeout = 120
worker_poll_sleep_time = 5
queue: SqsQueue
output_queue: SqsQueue
node_manifest: DynamodbManifest
solver: Solver
thread_id: int
ip_address: str
file_operations: FileOperations = FileOperations()
health_status: PollerStatus
s3_file_system: S3FileSystem = S3FileSystem.get_s3_file_system()
s3_bucket: str
task_end_notifier: TaskEndNotifier
def __init__(self, thread_id: int,
ip_address: str,
queue: SqsQueue,
output_queue: SqsQueue,
node_manifest: DynamodbManifest,
task_end_notifier: TaskEndNotifier,
solver: Solver,
s3_bucket: str):
threading.Thread.__init__(self)
self.queue = queue
self.output_queue = output_queue
self.node_manifest = node_manifest
self.task_end_notifier = task_end_notifier
self.solver = solver
self.thread_id = thread_id
self.ip_address = ip_address
self.logger = logging.getLogger("Poller")
self.logger.setLevel(logging.DEBUG)
self.health_status = PollerStatus.HEALTHY
self.s3_bucket = s3_bucket
def _is_valid_solver_request(self, solver_request):
# TODO improve message validation
# Expected message structure:
"""{
"formula" : {
"value" : <s3 url>,
"language": "SMTLIB2" | "DIMACS",
},
"solverConfig" : {
"solverName" : "",
"solverOptions" : [],
"taskTimeoutSeconds" : 5
},
"num_workers": 0
}"""
""" {
"formula": {
"value": "s3://834251193136-us-east-1-my-project/test.cnf",
"language": ""
},
"solverConfig": {
"solverName": "",
"solverOptions": [],
"taskTimeoutSeconds": 10
},
"num_workers": 0
}
"""
if solver_request is None:
return False
if "formula" not in solver_request:
return False
if "language" not in solver_request["formula"]:
return False
if "value" not in solver_request["formula"]:
return False
if "solverConfig" not in solver_request:
return False
if "solverName" not in solver_request["solverConfig"]:
return False
if "solverOptions" not in solver_request["solverConfig"]:
return False
if "taskTimeoutSeconds" not in solver_request["solverConfig"]:
return False
if "num_workers" not in solver_request:
return False
return True
def run(self):
# TODO: In the future we should make this more testable by separating the while loop
# from the thread code itself
# TODO: How do we want to handle client errors/internal server errors
while True:
# TODO: For now we are only handling problems that will fit in SQS message.
# In the future this will have to change
leader_working_directory = None
efs_shared_directory = None
try:
self.logger.info("Trying to get messages from queue: %s", self.queue.queue_name)
message = self.queue.get_message()
if message is None:
continue
msg_json = json.loads(message.read())
message_handle = message.msg.receipt_handle
self.logger.info("Got problem to solve from message with receipt handle: %s", message_handle)
message.delete()
self.logger.info("Deleted message from queue")
if not self._is_valid_solver_request(msg_json):
self.logger.error(f"Message {message.read()} is invalid. Skipping processing.")
continue
s3_uri = msg_json.get("formula").get("value")
timeout = msg_json.get("solverConfig").get("taskTimeoutSeconds")
formula_language = msg_json.get("formula").get("language")
solver_options = msg_json.get("solverConfig").get("solverOptions")
num_workers = msg_json.get("num_workers", 0)
self.logger.info("Waiting for worker nodes to come up")
self.logger.info(f"Task requests {num_workers} worker nodes")
workers = self.wait_for_worker_nodes(num_workers)
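# Include the leader's own IP so the solver's worker_node_ips list covers every participating node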
workers.append({"nodeIp": self.ip_address})
task_uuid = self.file_operations.generate_uuid()
efs_uuid_directory = self.file_operations.create_custom_directory(MOUNT_POINT, task_uuid)
self.logger.info("Created uuid directory in local container %s", efs_uuid_directory)
download_location = self.s3_file_system.download_file(s3_uri, efs_uuid_directory)
self.logger.info("Download problem to location: %s", download_location)
solver_response = self.solver.solve(download_location, efs_uuid_directory, workers, task_uuid, timeout, formula_language, solver_options)
solver_response["driver"]["s3_uri"] = s3_uri
self.logger.info("Solver response:")
self.logger.info(solver_response)
self.logger.info("Writing response to output queue")
self.output_queue.put_message(json.dumps(solver_response))
self.logger.info(f"Writing all files in request directory path: {efs_uuid_directory} to S3 bucket: {self.s3_bucket}")
s3_uri = "s3://" + self.s3_bucket + efs_uuid_directory
self.logger.info(f"S3 URI is: {s3_uri} and S3 bucket is: {self.s3_bucket}")
self.s3_file_system.upload_directory_tree(efs_uuid_directory, s3_uri)
self.logger.info(
"Cleaning up solver output directory %s",
solver_response["solver"].get("request_directory_path")
)
self.file_operations.remove_directory(solver_response["solver"].get("request_directory_path"))
self.logger.info("Cleaning up uuid directory: %s", efs_uuid_directory)
self.file_operations.remove_directory(efs_uuid_directory)
self.logger.info("Sending notification that solving is complete")
self.task_end_notifier.notify_task_end(self.ip_address)
except SolverException as e:
self.logger.error("Failed to run solver on message with receipt handle %s", message.msg.receipt_handle)
self.logger.exception(e)
except SqsQueueException as e:
self.logger.error("Failed to read from SQS queue: %s", self.queue.queue_name)
self.logger.exception(e)
except JSONDecodeError as e:
self.logger.error("Failed to run solver on message with receipt handle %s", message.msg.receipt_handle)
self.logger.error("Message is not valid Json: %s", message.read())
self.logger.exception(e)
except S3FileSystemException as e:
self.logger.error("Failed to download file from s3")
self.logger.exception(e)
def wait_for_worker_nodes(self, num_workers):
worker_nodes = []
wait_time = 0
self.logger.info(f"Waiting for {num_workers} worker nodes")
while len(worker_nodes) < num_workers:
worker_nodes = self.node_manifest.get_all_ready_worker_nodes()
sleep(self.worker_poll_sleep_time)
wait_time += self.worker_poll_sleep_time
if wait_time > self.worker_poll_timeout:
raise PollerTimeoutException(f"Timed out waiting for {num_workers} to report. "
f"Only {len(worker_nodes)} reported")
self.logger.info(f"Workers reported: {worker_nodes}")
return worker_nodes

View File

@ -0,0 +1 @@
# Marker file that indicates this package supports typing

View File

@ -0,0 +1,92 @@
"""Implementation of classes for interacting with S3"""
import logging
import ntpath
import os
from urllib.parse import urlparse
from botocore.exceptions import ClientError
class S3FileSystemException(Exception):
"""Exception for S3FileSystem errors"""
class S3FileSystem:
"""Class to represent an S3 storage"""
def __init__(self, s3_client):
self.s3_client = s3_client
self.logger = logging.getLogger("S3FileSystem")
self.logger.setLevel(logging.DEBUG)
def download_file(self, problem_uri: str, download_dest_folder: str):
"""
Function to download file based on user provided url
@param download_dest_folder: folder to download the problem to
@type problem_uri: s3 uri that is in the form of s3://bucket_name/path_to_file
"""
if problem_uri is None:
raise S3FileSystemException('s3 uri is empty')
problem_uri = urlparse(problem_uri, allow_fragments=False)
bucket_name = problem_uri.netloc
file_path = problem_uri.path.lstrip('/')
# Get the file name
file_name = ntpath.basename(file_path)
download_dest = os.path.join(download_dest_folder, file_name)
try:
self.logger.debug('Downloading file %s from bucket %s to destination %s', file_path, bucket_name, download_dest)
self.s3_client.download_file(bucket_name, file_path, download_dest)
except ClientError as e:
self.logger.error("Failed to download file from s3")
self.logger.exception(e)
raise S3FileSystemException(f"Failed to download file from s3 with download destination {download_dest}")
return download_dest
def upload_file(self, local_file_path: str, bucket_name: str, object_name: str):
try:
if object_name.startswith('/'):
object_name = object_name[1:]
self.logger.debug('Uploading file %s to bucket %s and file path %s', local_file_path, bucket_name, object_name)
self.s3_client.upload_file(local_file_path, bucket_name, object_name)
except ClientError as e:
self.logger.error("Failed to download file from s3")
self.logger.exception(e)
raise S3FileSystemException(f"Failed to upload file to s3 with upload destination {bucket_name}/{object_name}")
def upload_file_uri(self, local_file_path: str, s3_uri: str):
"""
Function to upload file to s3 from local filesystem based on user provided path
@param local_file_path: path to the problem to upload.
@type s3_uri: s3 uri that is in the form of s3://bucket_name/path_to_file for uploading.
"""
problem_uri = urlparse(s3_uri, allow_fragments=False)
bucket_name = problem_uri.netloc
object_name = problem_uri.path.lstrip('/')
# self.logger.info(f"bucket_name: {bucket_name}, object_name: {object_name}, ")
self.upload_file(local_file_path, bucket_name, object_name)
def upload_directory_tree(self, local_dir_path: str, s3_uri: str):
self.logger.debug('Uploading directory tree %s to %s', local_dir_path, s3_uri)
problem_uri = urlparse(s3_uri, allow_fragments=False)
bucket_name = problem_uri.netloc
object_name_base = problem_uri.path
# iterate through all files and upload them to s3
for root, dirs, files in os.walk(local_dir_path):
for file in files:
file_name = os.path.join(root, file)
object_name = os.path.join(object_name_base, file_name)
self.upload_file(file_name, bucket_name, object_name)
@staticmethod
def get_s3_file_system():
import boto3
s3 = boto3.client('s3')
return S3FileSystem(s3)

View File

@ -0,0 +1,181 @@
import logging
import boto3
from botocore.exceptions import ClientError
import os
import argparse
import sys
import json
import time
sys.path.append('../')
from arg_satcomp_solver_base.sqs_queue.sqs_queue import SqsQueue
class SolverTimeoutException(Exception):
pass
class S3ProblemStore:
"""Class to represent S3 storage location"""
def __init__(self, s3_resource, formula_bucket, logger):
self.s3_resource = s3_resource
self.logger = logger
self.formula_bucket = formula_bucket
def list_cnf_file_s3_path_pairs(self):
try:
self.logger.debug(f'Attempting to list files for bucket {self.formula_bucket}')
bkt = self.s3_resource.Bucket(self.formula_bucket)
pairs = [(bkt_object.key, f's3://{self.formula_bucket}/{bkt_object.key}') for bkt_object in bkt.objects.all()]
return pairs
except ClientError as e:
self.logger.error(f"Failed to list s3 bucket objects from {self.formula_bucket}")
self.logger.exception(e)
raise
def get_bucket(self):
return self.formula_bucket
@staticmethod
def get_s3_file_system(session, formula_bucket, logger):
s3 = session.resource('s3')
return S3ProblemStore(s3, formula_bucket, logger)
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# maximum retries on non-success (other than timeout)
RETRIES_MAX = 2
TIMEOUT = 1030
class ProblemRunner:
"""Class to run a problem set through a distributed solver"""
def __init__(self, problem_queue, result_queue, s3_problem_store, args, logger):
self.num_workers = args.num_workers
self.clean_first = args.clean_first
self.json_file = args.json_file
self.s3_problem_store = s3_problem_store
self.problem_queue = problem_queue
self.result_queue = result_queue
self.logger = logger
def run_one_problem(self, s3_uri):
msg = {"s3_uri": s3_uri, "num_workers": self.num_workers}
msg_str = json.dumps(msg, indent = 4)
done = False
retries = 0
while not done:
self.problem_queue.put_message(msg_str)
start_time = time.perf_counter()
result = self.result_queue.get_message()
while result is None:
self.logger.info(f"Awaiting completion for file: {s3_uri}")
end_time = time.perf_counter()
if end_time - start_time > TIMEOUT:
raise SolverTimeoutException(f"Client exceeded max time waiting for response ({str(TIMEOUT)}). Did leader crash?")
result = self.result_queue.get_message()
result_json = json.loads(result.read())
result.delete()
print(f"Problem {s3_uri} completed! result is: {json.dumps(result_json, indent=4)}")
if result_json["driver"]["timed_out"]:
done = True
else:
result = (result_json["solver"]["output"]["result"]).lower()
retries = retries + 1
if result == "unsatisfiable" or result == "satisfiable" or retries >= RETRIES_MAX:
done = True
return result_json
def run_problems(self):
results = {}
if os.path.exists(self.json_file) and not self.clean_first:
with open(self.json_file, "r") as result_file:
results = json.load(result_file)
for (input_file, s3_uri) in self.s3_problem_store.list_cnf_file_s3_path_pairs():
self.logger.info(f'attempting to solve file: {s3_uri}')
# skip previously solved files (unless 'clean' flag is true)
if input_file in results:
print(f"Problem {s3_uri} is cached from earlier run. Result is: {json.dumps(results[input_file], indent=4)}")
continue
result_json = self.run_one_problem(s3_uri)
# write answer (overwrite existing file)
results[input_file] = result_json
with open(self.json_file, "w") as result_file:
json.dump(results, result_file, indent=4)
def init_argparse() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Run a SATComp solver through all files in a bucket"
)
parser.add_argument(
"-v", "--version", action="version",
version = f"{parser.prog} version 0.1.0"
)
parser.add_argument('--profile', required = True, help = "AWS profile")
parser.add_argument('--problem-queue', required=True, type=str, help='Name of the problem SQS queue (sends jobs to solver) ')
parser.add_argument('--result-queue', required=True, type=str, help='Name of the result SQS queue (receives outputs from solver) ')
parser.add_argument('--s3-bucket', required=True, type=str, help='Name of the s3 bucket')
parser.add_argument('--num-workers', required=True, type=int, help='Number of workers in the cluster')
parser.add_argument('--verbose', type=int, help='Set the verbosity level of output: 0 = ERROR, 1 = INFO, 2 = DEBUG (default: 0)')
parser.add_argument('--clean-first', action='store_true', help='Clean the output file prior to run (default: False)')
parser.add_argument('--purge-queues', action='store_true', help='Purge queues and wait one minute prior to start?')
parser.add_argument('json_file', help='Path to json file containing results')
return parser
def main():
logger = logging.getLogger("satcomp_log")
try:
parser = init_argparse()
args = parser.parse_args()
if args.verbose and args.verbose > 0:
if args.verbose == 1:
logger.setLevel(logging.INFO)
elif args.verbose >= 2:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.ERROR)
logger.debug('command line arguments: ' + str(args))
session = boto3.Session(profile_name=args.profile)
s3 = session.resource('s3')
# create AWS access objects
problem_queue = SqsQueue.get_sqs_queue_from_session(session, args.problem_queue)
result_queue = SqsQueue.get_sqs_queue_from_session(session, args.result_queue)
s3_problem_store = S3ProblemStore.get_s3_file_system(session, args.s3_bucket, logger)
problem_runner = ProblemRunner(problem_queue, result_queue, s3_problem_store, args, logger)
if args.purge_queues:
logger.info('Purging problem and result queues')
problem_queue.purge()
result_queue.purge()
# recommendation of boto api
time.sleep(60)
problem_runner.run_problems()
except Exception as e:
logger.error(f"Failure during 'run_satcomp_solver'. Error: {str(e)}")
raise
if __name__ == "__main__":
main()

View File

@ -0,0 +1,120 @@
"""
This module contains the Solver implementation that shells out to a solver
executable and saves the logs
"""
import logging
import os
from json import JSONDecodeError
from arg_satcomp_solver_base.solver.solver import Solver
from arg_satcomp_solver_base.solver.run_command import CommandRunner
from arg_satcomp_solver_base.utils import FileOperations
FAILED_TO_WRITE_PROBLEM_TEXT = "Failed to write problem text to file"
class SolverException(Exception):
"""Solver run failure exception"""
class CommandLineSolver(Solver):
"""Solver implementation that shells out to an executable wrapper for a solver executable and saves the logs"""
solver_command: str
command_runner: CommandRunner
stdout_target_loc: str = "base_container_stdout.log"
stderr_target_loc: str = "base_container_stderr.log"
DEFAULT_TIMEOUT_SECONDS: int = 3600
def __init__(self, solver_command: str):
self.solver_command = solver_command
self.command_runner = CommandRunner(self.stdout_target_loc, self.stderr_target_loc)
self.file_operations = FileOperations()
self.logger = logging.getLogger("CommandLineSolver")
self.logger.setLevel(logging.DEBUG)
def _save_input_json(self, formula_file: str, directory_path: str, workers: list, formula_language: str, solver_argument_list: list, timeout_seconds: str):
input_json = {
"formula_file": formula_file,
"worker_node_ips": list(map(lambda w: w["nodeIp"], workers)),
"formula_language": formula_language,
"solver_argument_list": solver_argument_list,
"timeout_seconds": timeout_seconds,
}
try:
self.file_operations.write_json_file(os.path.join(directory_path, "input.json"), input_json)
except OSError as e:
self.logger.exception(e)
raise SolverException(FAILED_TO_WRITE_PROBLEM_TEXT)
def _run_command(self, cmd: str, arguments: list, output_directory: str, timeout: int):
"""Run a command as a subprocess and save logs"""
cmd_list = [cmd]
if arguments is not None:
cmd_list.extend(arguments)
process_result = self.command_runner.run(cmd_list, output_directory, timeout)
return process_result
def _get_solver_result(self, request_directory_path):
try:
raw_solver_result = self.file_operations.read_json_file(os.path.join(request_directory_path, "solver_out.json"))
except FileNotFoundError:
self.logger.error("Solver did not generate output JSON")
except JSONDecodeError as e:
self.logger.error("Solver output not valid json")
self.logger.exception(e)
else:
return raw_solver_result
# If we cannot read the solver result, that is not an error state
# since it is user provided and may not work
return None
def solve(self, formula_file: str, request_directory_path: str, workers: list, task_uuid: str, timeout: int, formula_language: str, solver_argument_list: list):
"""Solve implementation that shells out to a subprocess"""
self._save_input_json(formula_file, request_directory_path, workers, formula_language, solver_argument_list, timeout)
if not timeout:
timeout = CommandLineSolver.DEFAULT_TIMEOUT_SECONDS
try:
process_result = self._run_command(self.solver_command, [request_directory_path],
request_directory_path, timeout)
except FileNotFoundError:
self.logger.error(f"Failed to execute solver script at path: {self.solver_command}. "
f"Solver executable does not exist")
raise SolverException(f"Failed to execute solver script at path: {self.solver_command}. "
f"Solver executable does not exist")
solver_result = self._get_solver_result(request_directory_path)
self.logger.info(solver_result)
if solver_result is None:
task_state = {
"status": "FAILED",
"message": "Error retrieving solver result"
}
else:
task_state = process_result.get("task_state")
solver_runtime_millis = None
run_time_ns = process_result.get("run_time_ns")
if run_time_ns:
solver_runtime_millis = round(process_result.get("run_time_ns") / 1000000)
"""
Our output includes the stdout and stderr logs of the driver (the /satcomp/solver executable),
as well as whatever JSON output is provided once that solver finishes.
"""
return {
"task_id": task_uuid,
"driver": {
"stdout": os.path.join(request_directory_path, process_result.get("stdout")),
"stderr": os.path.join(request_directory_path, process_result.get("stderr")),
"return_code": process_result.get("return_code"),
"solver_runtime_millis": solver_runtime_millis
},
"solver": {
"output": solver_result,
"request_directory_path": request_directory_path
},
"task_state": task_state
}

View File

@ -0,0 +1,70 @@
import logging
import os
import time
import subprocess
import threading
import signal
from dataclasses import dataclass
from arg_satcomp_solver_base.utils import FileOperations
@dataclass
class CommandRunner:
TIMEOUT_RETURNCODE = -100
stdout_target_loc: str
stderr_target_loc: str
file_operations: FileOperations = FileOperations()
def __init__(self, stdout_target_loc, stderr_target_loc):
self.logger = logging.getLogger("RunCommand")
self.logger.setLevel(logging.DEBUG)
self.stdout_target_loc = stdout_target_loc
self.stderr_target_loc = stderr_target_loc
os.environ['PYTHONUNBUFFERED'] = "1"
def process_stream(self, stream, str_name, file_handle):
line = stream.readline()
while line != "":
self.logger.info(f"{str_name}: {line}")
file_handle.write(line)
line = stream.readline()
def run(self, cmd: list, output_directory: str, time_out: int):
self.logger.info("Running command: %s", str(cmd))
with open(os.path.join(output_directory, self.stdout_target_loc), "w") as stdout_handle:
with open(os.path.join(output_directory, self.stderr_target_loc), "w") as stderr_handle:
try:
start_time = time.perf_counter()
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True, start_new_session=True)
stdout_t = threading.Thread(target = self.process_stream, args=(proc.stdout, "STDOUT", stdout_handle))
stderr_t = threading.Thread(target = self.process_stream, args=(proc.stderr, "STDERR", stderr_handle))
stdout_t.start()
stderr_t.start()
return_code = proc.wait(timeout = time_out)
end_time = time.perf_counter()
elapsed = end_time - start_time
timed_out = False
except subprocess.TimeoutExpired:
timed_out = True
elapsed = time_out
self.logger.info("Timeout expired for process. Terminating process group w/sigterm")
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
# wait 10 seconds for a graceful shutdown.
try:
return_code = proc.wait(timeout = 10)
except subprocess.TimeoutExpired:
self.logger.info("Process unresponsive. Terminating process group w/sigkill")
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
return_code = self.TIMEOUT_RETURNCODE
stdout_t.join()
stderr_t.join()
return {
"stdout": self.stdout_target_loc,
"stderr": self.stderr_target_loc,
"return_code": return_code,
"output_directory": output_directory,
"elapsed_time": elapsed,
"timed_out": timed_out
}

View File

@ -0,0 +1,8 @@
from dataclasses import dataclass
@dataclass
class Solver:
"""Solver interface"""
def solve(self, formula_file: str, request_directory_path: str, workers: list, task_uuid: str, timeout: int, formula_language: str, solver_argument_list: list):
"""Interface for solve command"""

View File

@ -0,0 +1,100 @@
"""Implementation of classes for interacting with SQS"""
import logging
from botocore.exceptions import ClientError
class SqsQueueException(Exception):
"""Exception for SqsQueue errors"""
class QueueMessage:
"""Class to represent an SQS message"""
def __init__(self, msg, queue_resource):
self.logger = logging.getLogger("QueueMessage")
self.logger.setLevel(logging.DEBUG)
self.queue_resource = queue_resource
self.msg = msg
def read(self):
return self.msg.body
def delete(self):
self.logger.info("Deleting message with receipt handle: %s", self.msg.receipt_handle)
msg_txt = self.msg.body
try:
self.queue_resource.delete_messages(Entries=[
{
'Id': "test",
'ReceiptHandle': self.msg.receipt_handle
},
])
except ClientError as ex:
self.logger.error("Failed to delete message from SQS queue")
self.logger.exception(ex)
raise ex
else:
return msg_txt
class SqsQueue:
"""Class to represent an SQS queue"""
def __init__(self, queue_resource, queue_name):
self.queue_resource = queue_resource
self.queue_name = queue_name
self.logger = logging.getLogger("SqsQueue")
def get_message(self):
"""Get a single message off the queue if it exists"""
try:
self.logger.info("Trying to get message from queue %s", self.queue_name)
messages = self.queue_resource.receive_messages(
AttributeNames=['All'],
MessageAttributeNames=['All'],
MaxNumberOfMessages=1,
WaitTimeSeconds=20,
)
except ClientError as e:
self.logger.error("Failed to get message from SQS queue")
self.logger.exception(e)
raise SqsQueueException(f"Failed to get message from SQS queue {self.queue_name}")
if messages is None or len(messages) == 0:
return None
return QueueMessage(messages[0], self.queue_resource)
def put_message(self, msg):
"""Put a single message on the queue"""
try:
self.logger.info("Trying to put message onto queue %s", self.queue_name)
messages = self.queue_resource.send_message(
MessageBody=msg
)
except ClientError as e:
self.logger.error("Failed to put message on SQS queue")
self.logger.exception(e)
raise SqsQueueException(f"Failed to put message on SQS queue {self.queue_name}")
def purge(self):
try:
self.logger.info(f"Trying to purge queue {self.queue_name}")
self.queue_resource.purge()
except ClientError as e:
self.logger.error(f"Failed to purge queue {self.queue_name}")
self.logger.exception(e)
raise SqsQueueException(f"Failed to purge queue {self.queue_name}")
@staticmethod
def get_sqs_queue(queue_name: str):
import boto3
sqs = boto3.resource('sqs')
queue = sqs.get_queue_by_name(QueueName=queue_name)
return SqsQueue(queue, queue_name)
@staticmethod
def get_sqs_queue_from_session(session, queue_name: str):
import boto3
sqs = session.resource('sqs')
queue = sqs.get_queue_by_name(QueueName=queue_name)
return SqsQueue(queue, queue_name)

View File

@ -0,0 +1,38 @@
import threading
import polling2
from arg_satcomp_solver_base.solver.run_command import CommandRunner
from arg_satcomp_solver_base.task_end_notification.task_end_notifier import TaskEndNotifier
class TaskEndNotificationPoller(threading.Thread):
task_end_notifier: TaskEndNotifier
command_runner: CommandRunner
previous_notification_id: str = None
def __init__(self, task_end_notifier: TaskEndNotifier, command_runner: CommandRunner):
threading.Thread.__init__(self)
self.task_end_notifier = task_end_notifier
self.command_runner = command_runner
def run(self):
while True:
self.wait_for_notification()
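# A notification arrived: run the participant-provided cleanup script (e.g. to kill leftover solver processes)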
self.command_runner.run(cmd=["/competition/cleanup"], output_directory="/tmp", time_out = 10)
@polling2.poll_decorator(check_success=lambda has_notification: has_notification, step=0.5, poll_forever=True)
def wait_for_notification(self):
return self._check_for_notification()
def _check_for_notification(self):
notification_id = self.task_end_notifier.check_for_task_end(self.previous_notification_id)
if notification_id is not None:
self.previous_notification_id = notification_id
return True
return False
@staticmethod
def get_task_end_notification_poller():
task_end_notifier = TaskEndNotifier.get_task_end_notifier()
command_runner = CommandRunner("stdout.log", "stderr.log")
return TaskEndNotificationPoller(task_end_notifier, command_runner)

View File

@ -0,0 +1,57 @@
import time
import uuid
import boto3
class TaskEndNotifier:
def __init__(self, table, notification_expiration_time=5):
self.table = table
self.node_expiration_time = notification_expiration_time
def notify_task_end(self, leader_ip: str):
current_time = int(time.time())
notification_id = str(uuid.uuid4())
self.table.update_item(
Key={
'leaderIp': leader_ip
},
UpdateExpression="set notificationId = :notificationId, lastModified = :lastModifiedVal",
ExpressionAttributeValues={
":notificationId": notification_id,
":lastModifiedVal": current_time
}
)
def check_for_task_end(self, previous_notification_id=None):
expiration_time = int(time.time() - self.node_expiration_time)
filter_expression = "#lastModifiedKey > :lastModifiedVal"
attribute_names = {
"#lastModifiedKey": "lastModified"
}
attribute_values = {
":lastModifiedVal": expiration_time
}
if previous_notification_id is not None:
filter_expression += " and not (#notificationId = :notificationId)"
attribute_values[":notificationId"] = previous_notification_id
attribute_names["#notificationId"] = "notificationId"
notifications = self.table.scan(
Select="ALL_ATTRIBUTES",
FilterExpression=filter_expression,
ExpressionAttributeNames=attribute_names,
ExpressionAttributeValues=attribute_values,
ConsistentRead=True
)
if len(notifications["Items"]) > 0:
return notifications["Items"][0]["notificationId"]
return None
@staticmethod
def get_task_end_notifier():
dynamodb_resource = boto3.resource("dynamodb")
table = dynamodb_resource.Table("TaskEndNotification")
return TaskEndNotifier(table)

View File

@ -0,0 +1,60 @@
"""General utils package"""
import json
import logging
import os
import shutil
import uuid
from datetime import datetime
import pytz
class TimeProvider:
def get_current_time(self):
return datetime.now(pytz.utc)
class FileOperations:
def __init__(self):
self.logger = logging.getLogger("FileOperations")
def save_to_file(self, path: str, body: str):
"""Save a string to a file
Raises OSError if unable to open
"""
with open(path, "w", encoding="UTF-8") as file_handle:
file_handle.write(body)
def create_directory(self, task_uuid: str):
return self.create_custom_directory("/tmp/", task_uuid)
def create_custom_directory(self, directory_name, task_uuid: str):
request_directory_path = os.path.join(directory_name, task_uuid)
os.mkdir(request_directory_path)
return request_directory_path
def read_json_file(self, path: str):
with open(path) as solver_out_handle:
return json.loads(solver_out_handle.read())
def write_json_file(self, path: str, item: dict):
return self.save_to_file(path, json.dumps(item))
def read_log_file(self, file_path, max_file_length=None):
try:
with open(file_path, 'rb') as f:
if max_file_length is not None:
return f.read(max_file_length).decode("UTF-8")
return f.read().decode("UTF-8")
except FileNotFoundError as e:
self.logger.error("Could not open log file %s", file_path)
self.logger.exception(e)
return None
def generate_uuid(self):
return str(uuid.uuid4())
def remove_directory(self, path: str):
if os.path.exists(path):
shutil.rmtree(path)

View File

@ -0,0 +1,72 @@
"""
This module runs in the background of the Worker Base Container.
It reads the worker node status from a saved json file and updates the manifest.
"""
import json
import logging
import os
import socket
import threading
import uuid
import time
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest, NodeType
from arg_satcomp_solver_base.utils import FileOperations
class WorkerTimeoutException(Exception):
pass
class WorkerStatusChecker(threading.Thread):
"""Thread that runs in the background trying to get status of worker and
updates manifest every second. Throws an exception if status is not updated for given expiration time"""
worker_poll_sleep_time = 1
file_operations: FileOperations = FileOperations()
def __init__(self, node_manifest: DynamodbManifest, expiration_time: int, path: str):
threading.Thread.__init__(self)
self.node_manifest = node_manifest
self.logger = logging.getLogger("WorkerPoller")
self.logger.setLevel(logging.DEBUG)
self.expiration_time = expiration_time
self.path = path
def get_worker_info(self):
input = os.path.join(self.path, "worker_node_status.json")
return self.file_operations.read_json_file(input)
def update_manifest(self, uuid: str, local_ip_address: str, status: str):
self.logger.info(f"Giving node heartbeat for node {uuid} with ip {local_ip_address}")
self.node_manifest.register_node(str(uuid), local_ip_address, status, NodeType.WORKER.name)
def run(self):
local_ip_address = socket.gethostbyname(socket.gethostname())
node_uuid = uuid.uuid4()
self.logger.info(f"Registering node {node_uuid} with ip {local_ip_address}")
while True:
try:
current_timestamp = int(time.time())
self.logger.info("Trying to get worker node status")
worker_status_info = self.get_worker_info()
status = worker_status_info.get("status")
self.logger.info(f"Worker status is {status}")
worker_timestamp = worker_status_info.get("timestamp")
self.logger.info(f"Worker status updated time: {worker_timestamp}")
if worker_timestamp < current_timestamp - self.expiration_time:
raise WorkerTimeoutException("Timed out waiting for worker node status to update")
self.update_manifest(node_uuid, local_ip_address, status)
self.logger.info("#######################")
time.sleep(self.worker_poll_sleep_time)
except FileNotFoundError:
self.logger.error("worker_node_status file is not generated")
except json.JSONDecodeError as e:
self.logger.error("Worker status file is not valid Json")
self.logger.exception(e)

View File

@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""This file is the main entrypoint for the Worker Node Base Container"""
import logging
import os
import subprocess
import sys
from time import sleep
sys.path.append('/opt/amazon/lib/python3.8/site-packages/')
from arg_satcomp_solver_base.node_manifest.dynamodb_manifest import DynamodbManifest
from arg_satcomp_solver_base.worker.worker import WorkerStatusChecker
from arg_satcomp_solver_base.task_end_notification.task_end_notification_poller import TaskEndNotificationPoller
from arg_satcomp_solver_base.utils import FileOperations
logging.getLogger("botocore").setLevel(logging.CRITICAL)
logging.getLogger("boto3").setLevel(logging.CRITICAL)
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
class PollerTimeoutException(Exception):
pass
def run_worker(path):
cmd = os.path.join(path, "worker")
stdout = os.path.join(path, "worker_stdout.log")
stderr = os.path.join(path, "worker_stderr.log")
with open(stdout, "wb") as out, open(stderr, "wb") as err:
subprocess.Popen(cmd, stdout=out, stderr=err)
def write_leader_node_json(leader_nodes, dir):
file_operations: FileOperations = FileOperations()
output = os.path.join(dir, "leader_node_status.json")
## write most recently reporting leader
leader_nodes.sort(key = lambda a: a['lastModified'], reverse=True)
node_to_write = leader_nodes[0]
## get rid of decimal; incompatible with json writer
node_to_write['lastModified'] = int(node_to_write['lastModified'])
return file_operations.write_json_file(output, node_to_write)
## Small hack to let workers obtain the leader node's IP.
## Waits up to 10 minutes.
def get_current_leader_nodes(node_manifest, logger):
poll_sleep_time = 1
poll_timeout = 600
leader_nodes = []
wait_time = 0
logger.info(f"Waiting for 1 leader node")
while True:
leader_nodes = node_manifest.get_all_ready_leader_nodes()
if len(leader_nodes) >= 1:
break
sleep(poll_sleep_time)
wait_time += poll_sleep_time
if wait_time > poll_timeout:
raise PollerTimeoutException("Timed out waiting for leader to report. "
"Zero leaders reported")
logger.info(f"Leaders reported: {leader_nodes}")
return leader_nodes
if __name__ == "__main__":
logger = logging.getLogger("Worker Entrypoint")
dir = "/competition"
logger.info("Getting node manifest")
node_manifest = DynamodbManifest.get_dynamodb_manifest()
logger.info("Writing leader IP to json file")
leaders = get_current_leader_nodes(node_manifest, logger)
write_leader_node_json(leaders, dir)
logger.info("Running worker script from participant")
run_worker(dir) # Run script to save status to worker_node_status.json
sleep(1) # Wait for creation of worker_node_status.json file
logger.info("Starting worker status checker")
worker_status = WorkerStatusChecker(node_manifest, 5, dir)
worker_status.start()
task_end_notification_poller = TaskEndNotificationPoller.get_task_end_notification_poller()
task_end_notification_poller.start()

View File

@ -0,0 +1,15 @@
FROM satcomp-infrastructure:common
RUN apt-get update && apt-get install -y psmisc
RUN pip3 install flask waitress boto3 pytz polling2
RUN ls
COPY --chown=ecs-user satcomp-solver-resources/ /opt/amazon/
COPY --chown=ecs-user satcomp-worker/resources/worker /competition/worker
COPY --chown=ecs-user satcomp-worker/resources/cleanup /competition/cleanup
ENV PYTHONPATH=/opt/amazon:.:
RUN chmod u+x /opt/amazon/arg_satcomp_solver_base/worker_entrypoint.py
RUN chmod u+x /competition/worker
RUN chmod u+x /competition/cleanup
RUN service ssh start
USER ecs-user
EXPOSE 22
ENTRYPOINT /usr/sbin/sshd -D -f /home/ecs-user/.ssh/sshd_config & /opt/amazon/arg_satcomp_solver_base/worker_entrypoint.py

View File

@ -0,0 +1,3 @@
#!/bin/bash
echo "Cleaning up: Killing all mpi processes"
pkill -9 mpi

View File

@ -0,0 +1,18 @@
#!/usr/bin/env python3
""" This is a test script that saves worker status to json file every second"""
import json
import time
def update_worker_status():
while True:
data = {"status": "READY", "timestamp": int(time.time())}
with open(
"/competition/worker_node_status.json",
"w",
) as statusfile:
statusfile.write(json.dumps(data))
time.sleep(1)
if __name__ == "__main__":
update_worker_status()

View File

@ -0,0 +1,530 @@
# Configuring AWS Infrastructure for SAT/SMT-Comp
This README describes how to build and configure AWS infrastructure for the SAT and SMT parallel and cloud tracks. If you are only competing in the parallel track, a handful of the steps are unnecessary. We will point out these cloud-only steps as we go.
You will proceed through the following steps.
* [Mallob Quickstart](#mallob-quickstart). This will run through all the steps using just two scripts so that you can see an example cloud solver up and running quickly. Once you have seen it working, we will explain the steps in more detail.
* [Managing AWS Solver Infrastructure](#managing-solver-infrastructure). This section describes the infrastructure creation process in more detail, and how you can switch between parallel and cloud configurations for testing.
* [Preparing Docker Images for Upload to AWS](#preparing-docker-images). This section describes how to upload your solver to AWS.
* [Storing Analysis Problems in the Cloud](#storing-analysis-problems-in-the-cloud). This section describes how we store problems for your solver in S3, the main storage service for AWS.
* [Running your Solver in the Cloud](#running-your-solver). This section describes how we run your solver in the cloud.
Additional resources are available in a FAQ:
* [FAQ/Troubleshooting](#faq--troubleshooting)
Although there is a lot of information about how to build with AWS infrastructure here, most of the development of your solver can be performed using Docker locally on your laptop as described in the [Solver Development README](../docker/README-Solver-Development.md) (even for the cloud track). You only need to involve AWS when you want to look at performance testing on the competition hardware.
## Prerequisites
To use the AWS infrastructure, you will need the following tools installed:
- [python3](https://www.python.org/). To install the latest version for your platform, go to the [downloads](https://www.python.org/downloads/) page.
- [docker](https://www.docker.com/). There is a button to download Docker Desktop on the main page.
- [boto3](https://aws.amazon.com/sdk-for-python/). Once you have python3 installed (above), you can install this with `pip3 install boto3`.
Basic knowledge of AWS accounts and services is helpful, but we will walk you through all of the necessary steps.
We recommend that your development environment be hosted on Mac OS Monterey (v12.6), Amazon Linux 2 (AL2) or Ubuntu 20. Other platforms may work, but have not been tested.
### Installing the AWS CLI
In order to work with AWS, you must install the [AWS CLI](https://aws.amazon.com/cli/) for your platform. To get started, follow the directions for your operating system here:
[https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
If you have previously installed the AWS CLI, you should ensure it is up to date.
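For example, you can confirm that the CLI is installed and on your path by checking its version:
```text
aws --version
```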
### Creating AWS Credentials for your AWS Account
You must configure AWS credentials to access the resources in your account. For the purposes of simplifying the competition configurations, you will use a so-called _root level access key_. This is _not_ the best practice for security (which would be to create a separate user in your account) but it will suffice for the competition (see FAQ for more information).
In this section, you will be working with the Identity and Access Management (IAM) console. To create a root level access key, go to the [IAM Console](https://console.aws.amazon.com/iamv2/). (You can get to the console by clicking the previous link, or by searching for "IAM" in the search field at the top of the [AWS Console](https://console.aws.amazon.com) as shown [here](readme-images/iam-search.png) and then clicking the resulting link.)
On the IAM page, click "My Security Credentials" on the right side of the IAM console page as shown [here](readme-images/iam-quicklinks.png). Next, click on "Access keys (access key ID and secret access key)," then "Create New Access Key," and then "Show Access Key."
This will create an Access Key ID and a Secret Access Key. Copy these for use in the next step.
Next, on your local workstation, create a `~/.aws/credentials` file containing the following text:
```text
[default]
aws_access_key_id=ACCESS_KEY_ID
aws_secret_access_key=SECRET_ACCESS_KEY
region=us-east-1
```
`ACCESS_KEY_ID` and `SECRET_ACCESS_KEY` are the keys that you created in the previous step.
After installing the AWS CLI and creating credentials, check your configuration by attempting to run an AWS command. An example command that should work is:
```text
aws sts get-caller-identity
```
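If your credentials are configured correctly, this command returns a small JSON description of your identity. The exact values depend on the credentials you configured, but for a root access key the output looks roughly like this (with `ACCOUNT_NUMBER` replaced by your 12-digit account number):
```text
{
    "UserId": "ACCOUNT_NUMBER",
    "Account": "ACCOUNT_NUMBER",
    "Arn": "arn:aws:iam::ACCOUNT_NUMBER:root"
}
```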
If you receive an error message, see the [FAQ/Troubleshooting](#faq--troubleshooting) section at the bottom of this document.
We recommend that when you've completed account set-up, you follow the steps in the FAQ [How Do I Track My Spending?](#how-do-i-track-my-spending) to track your spending. For now, we'll continue with the setup.
## Mallob Quickstart
To allow you to see a solver up and running quickly, we have created two commands that provide a quick start to getting a Mallob container running in AWS. The first command, `quickstart-build`, creates the AWS infrastructure for a cloud solver, builds the Mallob Docker images, and uploads them to AWS. The second command, `quickstart-run`, creates a cluster with four nodes (1 leader and 3 workers) if one is not already running, waits for the cluster to be ready, then submits a solving job and reads the response from the output queue. After you test out the Mallob solver using these commands, you should go through the rest of the README to see the steps they perform and how to do the same with your own solver.
### Quickstart Build
This command creates the solver infrastructure in your AWS account, builds the Mallob Docker images (if not already built), and uploads them to AWS. The form is simply:
```text
./quickstart-build
```
This command will require 10-20 minutes to run. Once it is complete, Mallob should be ready to run, and the infrastructure should be set up to run a cloud solver.
### Quickstart Run
This command spins up a cluster, submits a job for a `test.cnf` file that we have uploaded to a storage location in S3 (the AWS storage service), waits for the result, and then spins the cluster back down. The form of the command is:
```text
./quickstart-run
```
Once the solver has completed, you should see a message back from the solver indicating that the problem was completed and that the result was SATISFIABLE. Later on, we will show you how to monitor a run using an AWS service called _CloudWatch_ and how to extract the intermediate files produced by the run, including the stdout and stderr logs, using an AWS service called _S3_.
Voila!
Now we walk through the different steps in more detail.
## Managing Solver Infrastructure
The `quickstart-build` command from the previous section creates the AWS infrastructure necessary to run your solver. In that creation process, the infrastructure is set up to run cloud (distributed) solvers. If you want to switch to hosting a parallel solver, run the update command described in the next section. If something goes wrong, then delete the infrastructure and re-create it as described in [Deleting and Re-creating the Infrastructure](#deleting-and-re-creating-the-infrastructure).
### Switching Between Cloud and Parallel Tracks
If you ran the Mallob `quickstart-build`, then you have already created your solver infrastructure and don't have to do any further work. In that creation process, the infrastructure is set up for cloud (distributed) solvers. Once the infrastructure has been created, if you want to update it to switch between machine configurations for the cloud and parallel tracks, you can run the `update-solver-infrastructure` script:
```text
./update-solver-infrastructure --solver-type SOLVER_TYPE
```
where:
* `SOLVER_TYPE`: either `cloud` or `parallel`, depending on which kind of solver you are running. Note that cloud solvers run on multiple 16-core machines ([m6i.4xlarge](https://aws.amazon.com/ec2/instance-types/)) with 64GB of memory, while parallel solvers run on a single 64-core machine ([m6i.16xlarge](https://aws.amazon.com/ec2/instance-types/)) with 256GB of memory.
This will change the instance type and memory configurations for the ECS images used. You can switch back and forth as needed.
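For example, if you created the infrastructure for the cloud track (as `quickstart-build` does) and want to test a parallel configuration instead, you would run:
```text
./update-solver-infrastructure --solver-type parallel
```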
### Deleting and Re-creating the Infrastructure
If something goes wrong and you want to start over, simply run the `delete-solver-infrastructure` script:
```text
./delete-solver-infrastructure
```
This will delete the infrastructure and associated resources. **Note that this deletion includes any files that have been uploaded to your S3 bucket and also any ECR docker images that you have created.** It will not delete your AWS account or security credentials.
Next, you will create the AWS infrastructure necessary to build and test solvers. The SAT and SMT competitions both use the same infrastructure elements; these should "just work", but there is more information about the different parts of the infrastructure in the FAQ. To set up your resources, simply run the `create-solver-infrastructure` script that we have provided:
```text
./create-solver-infrastructure --solver-type SOLVER_TYPE
```
where:
* `SOLVER_TYPE`: either `cloud` or `parallel`, depending on which kind of solver you are running. Note that cloud solvers run on multiple 16-core machines ([m6i.4xlarge](https://aws.amazon.com/ec2/instance-types/)) with 64GB of memory, while parallel solvers run on a single 64-core machine ([m6i.16xlarge](https://aws.amazon.com/ec2/instance-types/)) with 256GB of memory.
The script will take 5-10 minutes. When complete, all of the needed cloud resources should be created. The script polls until the creation is complete.
## Preparing Docker Images
### Creating the Base Docker Leader and Worker Images for Solvers
See the [SAT-Comp Docker Images README.md file](../docker/README.md) in the `docker` directory for instructions on how to build and test Docker images.
### Storing Solver Images in the ECR repository
Amazon stores your solver images in the [Elastic Container Registry (ECR)](https://console.aws.amazon.com/ecr).
The `create-solver-infrastructure` command described earlier creates an ECR repository with the same name as the project (comp23).
This repository will store the images for the leader and worker. Once you have created and tested a docker image (or images, for the cloud leader and worker) as described in the [SAT-Comp Docker Images README.md file](../docker/README.md), you can upload them to your AWS account with the `ecr-push` script:
```text
./ecr-push [--leader LEADER_IMAGE_TAG] [--worker WORKER_IMAGE_TAG]
```
where:
* `LEADER_IMAGE_TAG` is the tagged name of the leader docker image (e.g. satcomp-mallob:leader).
* `WORKER_IMAGE_TAG` is the tagged name of the worker docker image (e.g. satcomp-mallob:worker).
The leader and worker tags are optional; you can upload one or both docker images with this command (though if neither is specified, the script exits with an error).
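For example, if you built the Mallob images as described in the quickstart, the upload would look something like the following (the tags are illustrative; use whatever tags you gave your own images):
```text
./ecr-push --leader satcomp-mallob:leader --worker satcomp-mallob:worker
```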
## Storing Analysis Problems in the Cloud
We use the AWS Simple Storage Service (S3) to store the analysis problems we want to solve with our solver. S3 has a concept of a "bucket" which acts like a filesystem. As part of the `create-solver-infrastructure` CloudFormation script, we have created a bucket for you where you can store files: `ACCOUNT_NUMBER-us-east-1-comp23`, and added a `test.cnf` file to this bucket for testing. You can start with the `test.cnf` example and skip the rest of this section until you wish to add additional files or buckets for testing your solver.
You can copy files to the bucket with a command similar to this one (when executed from the root directory of this repository, this copies a `my-problem.cnf` file to the default bucket):
```text
aws s3 cp my-problem.cnf s3://ACCOUNT_NUMBER-us-east-1-comp23
```
When `s3 cp` is complete, you will see your file(s) in the list of objects in the bucket:
```text
aws s3 ls s3://ACCOUNT_NUMBER-us-east-1-comp23
```
More information on creating and managing S3 buckets is found here: [https://aws.amazon.com/s3/](https://aws.amazon.com/s3/). The S3 command line interface is described in more detail here: [https://docs.aws.amazon.com/cli/latest/userguide/cli-services-s3-commands.html](https://docs.aws.amazon.com/cli/latest/userguide/cli-services-s3-commands.html).
## Running Your Solver
After storing the docker images `comp23:leader` (and, for the cloud solver, `comp23:worker`) and placing at least one query file in your S3 bucket, you are ready to run your solver.
### Running Your Solver Using `quickstart-run`
You can use `quickstart-run` to spin up the cluster, run several files, and then spin down the cluster with one command. To run files other than the `test.cnf` file, you can provide a list of S3 file paths to run, one after the other. The command looks like:
```text
quickstart-run [--s3-locations S3_LOCATIONS [S3_LOCATIONS ...]]
```
In this case, after creating the cluster, it will try to run all files before spinning the cluster back down. This gives you a relatively safe way to run several jobs. **N.B.**: Keep an eye on it, though! If your solver fails while solving and does not generate an output, then the script will poll forever waiting for the result message from your solver, and will not spin down the cluster. In this case, follow the cleanup procedure below.
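For example, to run two problems from the default bucket (the second file name is a placeholder for a file you have uploaded yourself), you could invoke:
```text
./quickstart-run --s3-locations s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf s3://ACCOUNT_NUMBER-us-east-1-comp23/my-problem.cnf
```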
### Running Your Solver Using Basic Scripts
If you would like to have finer-grained control, choosing when to spin up and down the cluster, or building your own scripts, you can use the following steps:
1. Setup.
    a. Set the capacity for the cluster in [EC2](https://aws.amazon.com/ec2/).
    b. Set the desired number of workers in [ECS](https://aws.amazon.com/ecs/).
2. Run. Submit a solve job to the SQS queue by specifying the S3 location of the problem and the desired number of workers.
3. Cleanup. Set the EC2/ECS capacity for the cluster back to zero.
After setup, you can run any number of solve jobs before cleaning up.
Note: It is very important that you [clean up](#cluster-teardown) once you are done. EC2 nodes that remain active continue to be charged against your account.
### Cluster Setup
To set up and tear down the ECS cluster, we have provided a script called `ecs-config`. You provide the number of desired worker nodes as an argument. The script:
1. Creates an EC2 fleet large enough to run the experiment (the number of workers plus a leader node).
2. Creates the correct number of tasks in the ECS service for the leader and workers.
To run the script:
```text
ecs-config setup --workers [NUM_WORKERS]
```
where:
* `NUM_WORKERS` is the number of worker nodes you want to allocate. To avoid problems with possible resource limits for your AWS account, we recommend that you set `NUM_WORKERS` to `1` when working through this document for the first time. If you are building for the parallel track, set `NUM_WORKERS` to `0`, as all solving will be performed by the leader node.
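For example, to bring up a minimal cloud cluster with one leader node and one worker node (the recommended starting point), you would run:
```text
ecs-config setup --workers 1
```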
AWS typically requires 2-5 minutes to allocate nodes and host the ECS cluster. You can monitor the creation process by navigating to the [ECS console](https://us-east-1.console.aws.amazon.com/ecs) and selecting the _SatCompCluster_ link. [FIXME: link won't work outside us-east-1.]
The next page will show a list of job queues, including:
```text
job-queue-comp23-SolverLeaderService-...
job-queue-comp23-SolverWorkerService-...
```
The service is running and available when the number of running tasks for the leader is `1` and the number of running tasks for the worker service is `n`, where `n` is the value you passed as `NUM_WORKERS` to the `ecs-config` script.
Note: Your AWS account is charged for the number of EC2 instances that you run, so your bill will rise with the number of workers that you allocate.
### Job Submission and Execution
Before submitting a job, check that a cluster is set up and running, and the desired problem is available in an accessible S3 bucket.
Solver jobs are submitted by sending SQS messages. We have provided a `send_message` script to do this for you. You provide the S3 location of the file to run and number of desired worker nodes. The script submits an SQS request to the `ACCOUNT_NUMBER-us-east-1-SatCompQueue` queue, which the solver leader container is monitoring.
To run the script:
```text
send_message --location S3_LOCATION --workers NUM_WORKERS [--timeout TIMEOUT] [--name NAME] [--format FORMAT] [--args ARGS]
```
with required arguments:
* `S3_LOCATION` is the S3 location of the query file. For example, for the bucket we described earlier, the location would be `s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf`.
* `NUM_WORKERS` is the number of worker nodes to allocate for this problem. Again, we recommend that you start with `NUM_WORKERS` as `1` when beginning. For parallel solvers, you should set `NUM_WORKERS` to `0`.
and optional arguments:
* `TIMEOUT` is an optional parameter that sets the timeout in seconds for the solver. Defaults to 60s.
* `NAME` is an optional parameter providing a name for the solver (in case you want to host multiple solvers within one container). Defaults to the empty string.
* `FORMAT` is an optional parameter providing the problem format. Defaults to the empty string.
* `ARGS` is an optional parameter allowing one or more arguments to be passed to the solver. Defaults to the empty list.
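As a concrete example, the following submits the `test.cnf` problem from the default bucket to a cluster with a single worker and a 1000-second timeout (the timeout value here is purely illustrative):
```text
send_message --location s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf --workers 1 --timeout 1000
```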
The leader container will pull the message off of the queue, fetch the problem from S3, and begin execution. For more information on the steps performed, see [Extending the Solver Base Container](#understanding-the-solver-architecture-and-extending-the-competition-base-leader-container) below.
### Debugging on the Cloud: Monitoring and Logging
Debugging takes more effort on the cloud than on your local machine, because it takes time to upload your solver and deploy it, and observability can be reduced. Debug first at small scale on your local machine as described in the [Solver Development README](../docker/README-Solver-Development.md).
To debug on the cloud, there are two main mechanisms that we use to examine our solvers:
1. CloudWatch log groups, which allow a real-time view of stdout and stderr from your solver.
1. S3 storage of intermediate files, which allows you to dump files that you can inspect after the run.
**CloudWatch Log Groups:** To watch the logs in real-time you can navigate to the CloudWatch console, then choose the "log groups" item on the left side menu as shown in Figure 2.
![](readme-images/cloudwatch-menu.png)
_Figure 2: CloudWatch log groups menu_
After choosing the log groups menu item, you should see logs related to `/ecs/comp23-leader` and `/ecs/comp23-worker` as shown in Figure 3.
![](readme-images/cloudwatch-log-groups.png)
_Figure 3: CloudWatch log groups view_
These logs will capture both stdout and stderr from your solver containers. The logs are captured in something called _streams_, usually tied to a specific instance (so, for a cluster you will have one stream for the leader and one for each worker). Note that if you don't use a solver for a while, CloudWatch will close the stream and open a new one, as you see in Figure 4:
![](readme-images/cloudwatch-log-streams.png)
_Figure 4: CloudWatch log streams view_
The logs themselves are visible as lists of events as shown in Figure 5:
![](readme-images/cloudwatch-log-items.png)
_Figure 5: CloudWatch log items view_
AWS provides a full-featured log query language called [CloudWatch Logs Insights](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/AnalyzingLogData.html) (visible in the menu in Figure 2) that you can use to search for specific patterns within logs, which can often help speed up your debugging. A full discussion of Insights is outside the scope of this document, but the link above leads to a full set of documentation about how to use it.
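As a small illustration of what an Insights query looks like, the following (run against one of the `/ecs/comp23-*` log groups) lists the most recent log lines containing the word "ERROR":
```text
fields @timestamp, @message
| filter @message like /ERROR/
| sort @timestamp desc
| limit 20
```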
**S3 Storage of Intermediate Files:**
When the solver is invoked we store the `input.json` file and the analysis problem in a temporary directory that is passed as the argument to the solver. We recommend that you use the same directory (or create a subdirectory) for storing all of the intermediate files used during solving. At the conclusion of solving, all contents in this directory _for the leader solver_ will be uploaded to your S3 bucket (`ACCOUNT_NUMBER-us-east-1-comp23`) under a `/tmp/UUID` directory with a unique UUID so that you can inspect them offline. This way, you can instrument your solvers to write to the filesystem for later analysis. **N.B.:** currently this is only available for the leader solver; however, we may add the capability to upload data from the workers depending on customer feedback. If you store the stdout and stderr logs to this directory (as the Mallob example does), then the output message produced by the solver will identify the UUID used for the bucket for easy reference.
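Once a run has finished, you can list and download these intermediate files with the AWS CLI; `UUID` below is a placeholder for the directory name reported by your run:
```text
aws s3 ls s3://ACCOUNT_NUMBER-us-east-1-comp23/tmp/
aws s3 cp --recursive s3://ACCOUNT_NUMBER-us-east-1-comp23/tmp/UUID/ ./run-output/
```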
### Cluster Teardown
After testing, remember to tear down the cluster by running:
```text
ecs-config shutdown
```
This will terminate all EC2 instances and reset the number of leaders and workers to zero in the ECS service.
You should verify that no EC2 instances are running after running the teardown script. Navigate to the [EC2 console](https://console.aws.amazon.com/ec2) and check the `Instances` tab. There should be no running instances. Note that it might take a minute or two for the instances to shut down after running the script.
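An equivalent check from the command line is to list any instances still in the `running` state; after teardown completes, this should return an empty list:
```text
aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" --query "Reservations[].Instances[].InstanceId"
```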
Note: You are responsible for any charges made to your AWS account. While we have tested the shutdown script and believe it to be robust, you are responsible for monitoring the EC2 cluster to make sure resources have shut down properly. We provide alternate directions for shutting down resources using the console in the FAQ/Troubleshooting section.
## FAQ / Troubleshooting
#### How Do I Track My Spending?
[AWS CloudFormation](https://aws.amazon.com/cloudformation/) can be used to automate many cloud configuration activities. The file [setup-account.yaml](setup-account.yaml) in this repository is an optional CloudFormation template that can help you track your spending. It sets up notifications that are emailed to you when your account reaches 20%, 40%, 60%, 80%, 90%, 95%, and 100% of the monthly account budget. This provides a window into the current spend rate for building and testing your solver.
Here is the command to run it:
```text
aws cloudformation create-stack --stack-name setup-account-stack --template-body file://setup-account.yaml --parameters ParameterKey=emailAddress,ParameterValue=YOUR_EMAIL_ADDRESS
```
_Note_: Be sure to double-check that the email address is correct!
After running the aws cloudformation command, you can monitor the installation process by logging into your AWS account and navigating to the [CloudFormation console](https://console.aws.amazon.com/cloudformation).
Make sure you are in the us-east-1 region (You can select the region in the top right corner of the console page).
You should see a stack named `setup-account-stack`.
![](readme-images/cloudformation.png)
_Figure 1: Cloudformation result_
By clicking on this stack and choosing `events`, you can see the resources associated with the stack. After a short time, you should see a `CREATE_SUCCEEDED` event. If not (e.g., the email address was not syntactically valid), you will see a `CREATE_FAILED` event. In this case, delete the stack and try again. If you have trouble, email us at: [sat-comp@amazon.com](mailto:sat-comp@amazon.com) and we will walk you through the process.
Although it is handy to get emails when certain account budget thresholds have been met, it is both useful and important to check by-the-minute account spending on the console: [https://console.aws.amazon.com/billing/home](https://console.aws.amazon.com/billing/home).
#### I'm only submitting to the parallel track and not the cloud track. Do I need a worker image?
No. If you are submitting to the parallel track only, you do not need a worker image. For the parallel track, we will assume that the leader manages all threading and communications within the single (multi-core) compute node.
#### How do I manage my account after SAT-Comp if I want to follow security best practices?
If you continue using your AWS account after the competition, we recommend that you follow AWS best practices as described here:
[https://docs.aws.amazon.com/IAM/latest/UserGuide/best-practices.html#create-iam-users](https://docs.aws.amazon.com/IAM/latest/UserGuide/best-practices.html#create-iam-users)
#### What is the sequence of steps performed by the leader container to set up the `input.json` file?
The leader infrastructure performs the following steps:
1. Pull and parse a message from the `ACCOUNT_NUMBER-us-east-1-SatCompQueue` SQS queue with the format described in the [Job Submission and Execution section](#job-submission-and-execution).
1. Pull the appropriate solver problem from S3 from the location provided in the SQS message.
1. Save the solver problem on a shared EFS drive so that it can also be accessed by the worker nodes.
1. Wait until the requested number of workers have reported their status as READY along with their IP addresses.
1. Create a working directory for this problem.
1. Create and write an `input.json` file, containing the IP addresses of the worker nodes and the location of the problem, to the working directory.
1. Invoke the executable script located at path `/competition/solver` with a single parameter: the path to the working directory. The solver script will look for the `input.json` file in the working directory. It is also the location where solver output and error logs will be written.
1. The return code for `/competition/solver` will determine the expected result for the solver: A return code of 10 indicates SAT, 20 indicates UNSAT, 0 indicates UNKNOWN, and all other return codes indicate an error.
1. Upon task completion, notify all workers that the task has ended.
#### I ran the infrastructure scripts, and they seemed to work, but when I go out to the console, I don't see any of the infrastructure: S3 buckets, ECS Queues, etc. What happened?
The most common mistake people make when starting with AWS is not choosing the correct *region* for executing their jobs. In the console on the top right there is a selectable region:
<img src="readme-images/regions.png" alt="image of region selector" width="200"/>
Make sure that the region selected in the top-right of the console page is `us-east-1`.
#### I created a leader node and submitted a job, but the leader keeps saying it is waiting for worker nodes and not running my solver. What happened?
There are two likely causes. First, it could be that you submitted an SQS message that asked for more workers than you configured when you set up your cluster. Make sure that the number of worker nodes equals or exceeds the number of worker nodes requested in the SQS message. Second, if you just set up the ECS cluster, then it is likely that either the requested EC2 servers have not yet initialized, or that the ECS cluster has not yet started running on them. It takes 2-5 minutes for the resources to become available after they are requested.
There is also a less likely cause involving capacity limits. EC2 sets default capacity limits on the number of compute nodes that are available to an account. If you are running large-scale tests, it may be that you have exceeded the default number of nodes. To check, navigate to the EC2 console for the account and click on "Limits" on the left side. Type "All standard" in the search bar and determine the current limit. The limit is specified in terms of vCPUs, so each m6i.4xlarge instance uses 16 vCPUs, and each m6i.16xlarge uses 64 vCPUs. If you need additional vCPUs, click on the "All Standard" link and request a limit increase. **N.B.** Running a large cluster can become expensive. Make sure you limit your large-scale testing to preserve your AWS credits.
#### I'm watching the ECS cluster, but nothing is happening; no nodes are being created. What happened?
This can happen if you try to start up a cloud solver when the infrastructure is configured for parallel execution or vice versa. In this case, the machine image where you want to deploy does not match the requirements of the ECS task. Make sure that if you are running a cloud solver, you have not configured the infrastructure for parallel. If you are unsure, you can always run `update-solver-infrastructure` with the expected type. If the infrastructure is already set up for this type, it will be a no-op.
#### Suppose I want to use the console to set up my ECS clusters, or to monitor them. How do I do that?
__Step 1:__ Update the size of the EC2 cluster using the EC2 console
To control the instances in your cluster, go to the EC2 console and scroll down on the left side of the console and click on the link that says "Auto Scaling Groups".
In the next page, you will see an autoscaling group called something like `job-queue-comp23-EcsInstance`.
1. Select the queue by clicking on it, then click the "Edit" button in the "Group details" section.
1. Set the desired and maximum capacity to n (where n includes 1 leader and n-1 workers, so the minimum useful size is 2), and click "Update".
__Step 2:__ Update the size of the ECS task pool for the leader and workers
Navigate to the ECS console, then select the SatCompCluster link.
The next page should show a list of job queues, including:
```text
job-queue-comp23-SolverLeaderService-...
job-queue-comp23-SolverWorkerService-...
```
Click on the SolverLeaderService job queue and choose "Update". Set the number of tasks to 1, then choose "Skip to review", then click "Update Service". You should then navigate back to the SatCompCluster link and click on the SolverWorkerService link. Choose "Update" and set the number of tasks to n-1, where 'n' is the number of EC2 nodes you created in Setup Step 1.
Please allow 2-3 minutes for the instances to boot up and register with the cluster. When registered, you should see 1 desired and 1 running task for the leader and n-1 desired tasks for the worker in the SatCompCluster screen.
If you get the following error, it means that your instances have not yet booted up:
```text
An error occurred (InvalidParameterException) when calling the RunTask operation: No Container Instances were found in your cluster.
```
If they fail to boot up after 5 minutes, please verify that both Desired Capacity and Maximum Capacity are set correctly.
You incur costs for the time the cluster is running.
#### Suppose I want to use the console to tear down my ECS clusters. How do I do that?
To control the instances in your cluster, go to the EC2 console and scroll down on the left side of the console and click on the link that says "Auto Scaling Groups".
In the next page, you will see an autoscaling group called something like job-queue-comp23-EcsInstance.
1. Select the queue by clicking on it, then click the "Edit" button in the "Group details" section.
1. Set the desired and maximum task capacity to 0. This shuts down any EC2 instances.
#### What are the various AWS services that are used in the infrastructure?
The infrastructure pieces are as follows:
* Compute resources necessary to host solvers, provided by the [Elastic Compute Cloud](https://aws.amazon.com/ec2/) (EC2) service.
* A registry for solver Docker image files, provided by the [Elastic Container Registry](https://aws.amazon.com/ecr/) (ECR) service.
* A compute environment to support automated launching of containers on compute resources, provided by the [Elastic Container Service](https://aws.amazon.com/ecs/) (ECS) service.
* A storage location for solver queries that uses the [Simple Storage Service](https://aws.amazon.com/s3/) (S3) service.
* A distributed file system that can be used for communication between leader and worker solvers, using the [Elastic File System](https://aws.amazon.com/efs/) (EFS) service.
* A problem queue that the solver leader uses to extract the next problem to solve, provided by the [Simple Queue Service](https://aws.amazon.com/sqs/) (SQS).
You can click the links above for more specifics on each of the services.
#### Suppose I want to use the console to send SQS messages to start executing jobs. How do I do that?
Submit a job using the [Simple Queue Service (SQS) console](https://console.aws.amazon.com/sqs/).
1. Navigate to the SQS console, and select the queue named `ACCOUNT_NUMBER-us-east-1-SatCompQueue`.
2. Click the *SendAndReceiveMessages* button
3. Set the message body to something matching the following structure:
```text
{"s3_uri":"s3://PATH_TO_CNF_PROBLEM",
"num_workers": DESIRED_NUMBER_OF_WORKERS}
```
For example, for the bucket we described earlier, given a cluster with two nodes (one worker), it would be:
```text
{"s3_uri":"s3://ACCOUNT_NUMBER-us-east-1-satcompbucket/test.cnf",
"num_workers": 1}
```
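If you prefer the command line to the console, the same message can be sent with the AWS CLI. The queue URL below follows the standard SQS URL form for your account number and the queue name used by this infrastructure:
```text
aws sqs send-message \
    --queue-url https://sqs.us-east-1.amazonaws.com/ACCOUNT_NUMBER/ACCOUNT_NUMBER-us-east-1-SatCompQueue \
    --message-body '{"s3_uri": "s3://ACCOUNT_NUMBER-us-east-1-comp23/test.cnf", "num_workers": 1}'
```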
#### What if I want to create different buckets other than the one provided for storing problems in?
In case you wish to create a different bucket, here is a command to create a bucket named `comp23-satcomp-examples`:
```text
aws s3api create-bucket --bucket comp23-satcomp-examples
```
Notes:
- This will create the bucket in the AWS region specified in your named profile.
- If you chose a different region than `us-east-1`, you might get an `IllegalLocationConstraintException`. To deal with this, append the argument `--create-bucket-configuration '{"LocationConstraint": "YOUR-REGION-NAME"}'` to the command, where `YOUR-REGION-NAME` is replaced with the name of the region you chose (see the example after this list).
- S3 buckets are globally unique across all of AWS, which means that the command will fail if someone else is already using the same bucket name. A simple fix is to add the AWS account number to the bucket name, which will likely yield a unique bucket name.
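Putting these notes together, creating a bucket in a region other than `us-east-1` (here `us-west-2`, purely as an illustration, with the account number prepended for uniqueness) would look like:
```text
aws s3api create-bucket --bucket ACCOUNT_NUMBER-comp23-satcomp-examples --region us-west-2 \
    --create-bucket-configuration '{"LocationConstraint": "us-west-2"}'
```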
#### I'd like to know more about how the Elastic Container Registry works. Can I do the steps to upload files by hand?
Amazon stores your solver images in the [Elastic Container Registry (ECR)](https://console.aws.amazon.com/ecr).
The `create-solver-infrastructure` command described earlier creates an ECR repository named `comp23`.
This repository stores the images for the leader and worker.
The repository has an associated URI (shown on the console page), which is what we use to push docker images to ECR.
The format for URIs is
```text
ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23
```
You will use these URIs to describe where to store your Docker images in AWS.
Note: a good way to make sure that you type these URI names correctly is to navigate to the Elastic Container Registry (ECR) console in AWS and copy them from the page (the names are in the URI column).
To store a Docker image in ECR from your local machine, first you need to login to ECR:
```text
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com
```
Next, you need to tag the image to match the ECR repository. For the worker, tag the image as:
```text
docker tag LOCAL_WORKER_IMAGE_ID ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:worker
```
where
* **LOCAL\_WORKER\_IMAGE\_ID** is the local worker image tag (e.g., `comp23:worker`).
* **ACCOUNT\_NUMBER** is the account number where you want to store the image.
For the leader, tag the image as:
```text
docker tag LOCAL_LEADER_IMAGE_ID ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:leader
```
where
* **LOCAL\_LEADER\_IMAGE\_ID** is the local image for the leader, and the other parameters are the same as for the worker.
After these steps, you can docker push the images:
```text
docker push ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:leader
docker push ACCOUNT_NUMBER.dkr.ecr.us-east-1.amazonaws.com/comp23:worker
```
You should see network progress bars for both images.
More information related to the ECR procedures is described here: [https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html](https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html).
If you have trouble, please email us at: [sat-comp@amazon.com](mailto:sat-comp@amazon.com) and we will walk you through the process.

View File

@ -0,0 +1,2 @@
#!/bin/sh
python3 manage-solver-infrastructure.py --mode create $*

View File

@ -0,0 +1,3 @@
#!/bin/sh
# --solver-type doesn't matter in this case, so we provide it.
python3 manage-solver-infrastructure.py --mode delete --solver-type parallel $*

View File

@ -0,0 +1,114 @@
#!/usr/bin/env python3
import argparse
import logging
import boto3
from botocore.exceptions import ClientError, ProfileNotFound
import sys
import subprocess
import re
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Runner")
logger.setLevel(logging.INFO)
class STS:
def __init__(self, sts_client) -> None:
self.sts = sts_client
def get_account_number(self) -> str:
try:
return self.sts.get_caller_identity()["Account"]
except Exception as e:
logger.error(f"Failed to get profile account number: {e}")
raise e
def arg_parser() -> dict:
parser = argparse.ArgumentParser()
parser.add_argument('--profile', required = False, help = "AWS profile")
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
parser.add_argument('--leader', required = False, help = "Docker tag name of the leader container")
parser.add_argument('--worker', required = False, help = "Docker tag name of the worker container (only for cloud track)")
args = parser.parse_args()
# argument checking.
# matcher = re.compile(allowable_project_names_mask)
if not re.match(r"^[a-z]([a-z\d\/_-])*$", args.project):
logger.error(f"Invalid project name '{args.project}'.")
logger.error(f"Project names must start with a lowercase letter and can consist only of lowercase numbers, digits, '/', '_', or '-' characters.")
sys.exit(-1)
if not (args.leader or args.worker):
logger.error(f"Neither leader or worker image specified...nothing to upload!")
sys.exit(-1)
return args
def main() -> None:
args = arg_parser()
try:
session = boto3.Session(profile_name=args.profile)
except ProfileNotFound as e:
logger.error(f"Unable to create AWS session. Profile '{profile}' not found. Please double check that this profile is set up in the ~/.aws/config file")
sys.exit(1)
if not session.region_name:
logger.error(f"Profile does not have a region defined. Please add a region (recommend: us-east-1) to profile '{profile}' in the ~/.aws/config file")
sys.exit(1)
# link to AWS services
sts_client = session.client('sts')
sts = STS(sts_client)
# in order to make the call, I need:
# - the account #
# - the region
# - the profile
# - the project id
# - the leader container
# - the worker container
account_number = sts.get_account_number()
region = session.region_name
project_name = args.project
profile_args = ['-p', args.profile] if args.profile else []
leader_args = ['-l', args.leader] if args.leader else []
worker_args = ['-w', args.worker] if args.worker else []
# run the shell script file to set it up.
cmd = ' '.join(['./ecr-push-internal.sh', \
'-j', project_name, \
'-a', str(account_number), \
'-r', region] + \
profile_args + leader_args + worker_args)
logger.info(f"About to run: { cmd }")
result = subprocess.run(cmd, shell=True)
if result.returncode == 1:
logger.error("The login command to ECR failed. Please double check that you have the correct rights in the profile.")
exit(1)
elif result.returncode == 2:
logger.error(f"The attempt to tag the leader image failed. Please make sure that you have a local leader image associated with {args.leader}.")
exit(2)
elif result.returncode == 3:
logger.error(f"The attempt to upload the leader failed. Please make sure that there is a repository associated with {project_name}.")
exit(3)
elif result.returncode == 4:
logger.error(f"The attempt to tag the worker image failed. Please make sure that you have a local worker image associated with {args.worker}.")
exit(4)
elif result.returncode == 5:
logger.error(f"The attempt to upload the worker failed. Please make sure that there is a repository associated with {project_name}.")
exit(5)
elif result.returncode == 0:
logger.info("Upload succeeded. ")
exit(0)
else:
logger.error(f"Unexpected return code: {result.returncode} from ecr-push-internal.sh; exiting")
exit(6)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,2 @@
#!/bin/sh
python3 docker-upload-to-ecr.py $*

View File

@ -0,0 +1,115 @@
#!/bin/bash
print_args() {
echo "usage: ecr-push-internal.sh -p PROFILE -j PROJECT -a ACCOUNT -r REGION -l LEADER [-h] [-w WORKER]"
echo " "
echo "required arguments:"
echo " -j, --project PROJECT Name of the project"
echo " -a. --account ACCOUNT Account number"
echo " -r, --region REGION Region"
echo "optional arguments:"
echo " -p, --profile PROFILE AWS profile"
echo " -h, --help show this message and exit"
echo " -l, --leader LEADER Name of the leader local docker image"
echo " -w, --worker WORKER Name of the worker local docker image"
}
failure() {
echo "Failure: $2"
exit $1
}
POSITIONAL_ARGS=()
if [ $# -lt 4 ];
then
echo "args: $*"
echo "arg count: $#"
print_args
exit -1;
fi
while [[ $# -gt 0 ]]; do
case $1 in
-l|--leader)
LEADER="$2"
shift # past argument
shift # past value
;;
-w|--worker)
WORKER="$2"
shift # past argument
shift # past value
;;
-p|--profile)
PROFILE="$2"
shift # past argument
shift # past value
;;
-j|--project)
PROJECT="$2"
shift
shift
;;
-a|--account)
ACCOUNT="$2"
shift
shift
;;
-r|--region)
REGION="$2"
shift
shift
;;
-h|--help)
echo "ABOUT TO RUN HELP"
print_args
shift # past argument
;;
-*|--*)
echo "Unknown option $1"
exit 1
;;
*)
echo "Unknown positional argument $1"
exit 1
;;
esac
done
# echo "Leader is: " $LEADER
# echo "Worker is: " $WORKER
# echo "Project is: " $PROJECT
# echo "Profile is: " $PROFILE
if [ -z ${PROJECT+x} ] || [ -z ${ACCOUNT+x} ] || [ -z ${REGION+x} ] ;
then
echo "account, region, project and profile are all required arguments"
exit 1;
fi
if [ -z ${PROFILE+x} ] ;
then
echo "Profile not set.";
else
PROFILE_ARG="--profile ${PROFILE}"
echo "Profile arg is: $PROFILE_ARG";
fi
aws $PROFILE_ARG --region $REGION ecr get-login-password | docker login --username AWS --password-stdin "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com || failure 1 "Line ${LINENO}"
if [ -z ${LEADER+x} ];
then
echo "No leader set";
else
docker tag "$LEADER" "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":leader || failure 2 "Line ${LINENO}"
docker push "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":leader || failure 3 "Line ${LINENO}"
fi
if [ -z ${WORKER+x} ];
then
echo "No worker set";
else
docker tag "$WORKER" "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":worker || failure 4 "Line ${LINENO}"
docker push "$ACCOUNT".dkr.ecr."$REGION".amazonaws.com/"$PROJECT":worker || failure 5 "Line ${LINENO}"
fi

View File

@ -0,0 +1,166 @@
#!/usr/bin/env python3
import argparse
import logging
import boto3
import time
import pprint
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Runner")
logger.setLevel(logging.INFO)
class EcsService:
def __init__(self, client):
self.ecs = client
def get_ecs_service(self, service_name):
"""Get ECS services and return Worker service name
Returns: ECS worker node servicename
"""
try:
response = self.ecs.list_services(
cluster='SatCompCluster',
)
for service in response['serviceArns']:
if service_name in service:
return service
except Exception as e:
logger.error(f"Failed to get ECS service names: {e}")
raise e
def update_ecs_service(self, leader_node_count, worker_node_count):
worker_service = self.get_ecs_service("SolverWorkerService")
leader_service = self.get_ecs_service("SolverLeaderService")
try:
self.ecs.update_service(
cluster="SatCompCluster",
service=leader_service,
desiredCount=leader_node_count
)
self.ecs.update_service(
cluster="SatCompCluster",
service=worker_service,
desiredCount=worker_node_count
)
except Exception as e:
logger.error(f"Failed to update ECS service: {e}")
raise e
def describe_ecs_services(self):
result = {}
worker_service = self.get_ecs_service("SolverWorkerService")
leader_service = self.get_ecs_service("SolverLeaderService")
try:
result = self.ecs.describe_services(
cluster="SatCompCluster",
services=[leader_service, worker_service]
)
except Exception as e:
logger.error(f"Failed to describe ECS service: {e}")
raise e
return result
class ScalingGroup:
def __init__(self, client) -> None:
self.asg_client = client
def update_asg(self, desired_count: int):
try:
response = self.asg_client.describe_auto_scaling_groups()['AutoScalingGroups']
for group in response:
if 'EcsInstanceAsg' in group["AutoScalingGroupName"]:
asg_name = group["AutoScalingGroupName"]
response = self.asg_client.update_auto_scaling_group(
AutoScalingGroupName= asg_name,
MaxSize=desired_count,
DesiredCapacity=desired_count,
)
except Exception as e:
logger.error(f"Failed to update ASG: {e}")
raise e
def await_completion(ecs_service, asg_client):
# wait for ECS setup/teardown to complete
start = time.time()
while True:
status = ecs_service.describe_ecs_services()
leader = status["services"][0]["deployments"][0]
leader_desired = leader["desiredCount"]
leader_pending = leader["pendingCount"]
leader_running = leader["runningCount"]
worker = status["services"][1]["deployments"][0]
worker_desired = worker["desiredCount"]
worker_pending = worker["pendingCount"]
worker_running = worker["runningCount"]
elapsed = time.time() - start
print(f"Waiting for ECS ({elapsed/60.:3.1f} mins)")
print(f" leader: {leader_desired} desired, {leader_pending} pending, {leader_running} running")
print(f" workers: {worker_desired} desired, {worker_pending} pending, {worker_running} running")
print("")
if (leader_desired==leader_running and worker_desired==worker_running):
print("ECS configuration complete")
return
time.sleep(30)
# MWW: I am disabling this output since the output 'lies' in the sense that it reports
# a failure in the usual case, before eventually succeeding.
# put this after the first sleep, since it's usually delayed
# asg_status = asg_client.describe_scaling_activities()
# only display the most recent message
# asg_update = asg_status["Activities"][0]
# print(f"Most recent AutoScaling Activity Log")
# print(f" StatusCode: {asg_update['StatusCode']}")
# print(f" StatusCause: {asg_update['Cause']}")
# print(f" StatusDescription: {asg_update['Description']}")
# print("")
#pprint.pprint(asg_update)
#print("")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('mode', choices = ["setup", "shutdown"], help = "AddInstances / DeleteInstances.")
parser.add_argument('--profile', default = "default", required = False, help = "AWS profile")
parser.add_argument('--workers', required = False, help = "Required Worker nodes count")
args = parser.parse_args()
if args.mode == 'setup':
# Setup Instances
worker_node_count = args.workers
leader_node_count = "1"
desired_count = str(int(worker_node_count)+1)
elif args.mode == 'shutdown':
# Shutdown instances
leader_node_count = worker_node_count = desired_count = "0"
session = boto3.Session(profile_name=args.profile)
ecs = session.client('ecs')
ecs_service = EcsService(ecs)
asg_client = session.client('autoscaling')
asg = ScalingGroup(asg_client)
asg.update_asg(int(desired_count))
try:
ecs_service.update_ecs_service(int(leader_node_count), int(worker_node_count))
except Exception as e:
logger.info(f"Failed to update ECS service. {e}")
logger.info("Updating ASG")
asg.update_asg("0")
# wait for ECS setup/teardown to complete
await_completion(ecs_service, asg_client)

View File

@ -0,0 +1,333 @@
#!/usr/bin/env python3
import argparse
import logging
import boto3
from botocore.exceptions import ClientError, ProfileNotFound
import time
import re
import json
import sys
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Runner")
logger.setLevel(logging.INFO)
class STS:
def __init__(self, sts_client) -> None:
self.sts = sts_client
def get_account_number(self) -> str:
try:
return self.sts.get_caller_identity()["Account"]
except Exception as e:
logger.error(f"Failed to get profile account number: {e}")
raise e
class ECR:
def __init__(self, ecr_client) -> None:
self.ecr = ecr_client
def delete_repository_images(self, repository_name) -> None:
try:
images = self.ecr.list_images(
repositoryName=repository_name
)
if images["imageIds"]:
self.ecr.batch_delete_image(
repositoryName=repository_name,
imageIds=images["imageIds"]
)
except self.ecr.exceptions.RepositoryNotFoundException as e:
logger.info(f"Repository {repository_name} not found (already deleted). Error: {e}")
except Exception as e:
logger.error(f"Failed to delete repository images: {e}")
raise e
def delete_project_repositories(self, project_name) -> None:
# These must be kept synchronized with Cfn.
repository_name = project_name
self.delete_repository_images(repository_name)
class S3Filesystem:
def __init__(self, s3_client) -> None:
self.s3 = s3_client
def delete_bucket(self, bucket_name) -> None:
try:
bucket = self.s3.Bucket(bucket_name)
bucket.objects.all().delete()
bucket.object_versions.delete()
bucket.Policy().delete()
except Exception as e:
logger.error(f"Failed to delete all objects in bucket: {e}")
raise e
def upload_file_to_s3(self, bucket_name: str, file_name: str, object_name=None) -> None:
"""Upload a file to an S3 bucket
:param file_name: File to upload
:param bucket: Bucket to upload to
:param object_name: S3 object name. If not specified then file_name is used
:return: True if file was uploaded, else False
"""
# If S3 object_name was not specified, use file_name
if object_name is None:
object_name = file_name
# Upload the file
try:
bucket = self.s3.Bucket(bucket_name)
bucket.upload_file(file_name, object_name)
except ClientError as e:
logging.error(f"Failed to upload file to s3: {e}")
raise e
class SSM:
def __init__(self, ssm_client) -> None:
self.ssm = ssm_client
# Get the recommended AMI.
# The instance AMI describes the
# [Amazon Machine Image](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html),
# the basic software image to be loaded onto each EC2 instance, that describes the
# operating system and other configuration information.
def get_ami_image(self) -> str:
try:
result = self.ssm.get_parameters(
Names=["/aws/service/ecs/optimized-ami/amazon-linux-2/recommended"])
logger.debug(f"Result of get_parameters is {result}")
result_value = json.loads(result["Parameters"][0]["Value"])
image_id = result_value["image_id"]
logger.info(f"Recommended image id: {image_id}")
return result_value["image_id"]
except Exception as e:
logger.error(f"Failed to call ssm.get_parameters to find the recommended AMI for linux: {e}")
raise e
class CloudFormation:
def __init__(self, cfn_client, stack_name: str) -> None:
self.stack_name = stack_name
self.cfn = cfn_client
def create_parameters(self, project_name, instance_type, ami_id, container_memory):
return [
{
'ParameterKey': "ProjectName",
'ParameterValue': project_name
},
{
'ParameterKey': "AvailZoneId",
'ParameterValue': "a"
},
{
'ParameterKey': "InstanceType",
'ParameterValue': instance_type
},
{
'ParameterKey': "AmiImageId",
'ParameterValue': ami_id
},
{
'ParameterKey': "ContainerMemory",
'ParameterValue': str(container_memory)
},
]
def get_stack(self):
return self.cfn.Stack(self.stack_name)
def create_cloudformation_stack(self, project_name, instance_type, ami_id, container_memory) -> None:
try:
cf_template = open('solver-infrastructure.yaml').read()
response = self.cfn.create_stack(
StackName=self.stack_name,
TemplateBody=cf_template,
Capabilities=["CAPABILITY_IAM"],
Parameters=self.create_parameters(project_name, instance_type, ami_id, container_memory)
)
except self.cfn.exceptions.AlreadyExistsException as exists:
logger.error("Stack already exists: perhaps it failed to properly create? Try deleting it with delete-solver-infrastructure.")
raise exists
except Exception as e:
logger.error(f"Failed to create stack: {e}")
raise e
def delete_cloudformation_stack(self) -> None:
try:
stack = self.cfn.Stack(self.stack_name)
stack.delete()
except Exception as e:
logger.error(f"Failed to delete stack: {e}")
raise e
def update_cloudformation_stack(self, project_name, instance_type, ami_id, container_memory) -> None:
try:
cf_template = open('solver-infrastructure.yaml').read()
stack = self.cfn.Stack(self.stack_name)
response = stack.update(
StackName=self.stack_name,
TemplateBody=cf_template,
Capabilities=["CAPABILITY_IAM"],
Parameters=self.create_parameters(project_name, instance_type, ami_id, container_memory)
)
except Exception as e:
logger.error(f"Failed to create stack: {e}")
raise e
def await_completion(self, mode) -> bool:
while True:
stack = self.get_stack()
try:
status = stack.stack_status
except Exception as e:
logger.info(f"Stack {mode} operation in progress. Cloudformation states stack is deleted.")
return mode == 'delete'
logger.info(f"Stack {mode} operation in progress. Current status: {stack.stack_status}")
if "IN_PROGRESS" in stack.stack_status:
logger.info("Pausing 30 seconds")
# Sleep time added to reduce cloudformation api calls to get status
time.sleep(30)
elif ("ROLLBACK" in stack.stack_status or
"FAILED" in stack.stack_status):
logger.info(f"Cloudformation operation {mode} failed!")
return False
elif "COMPLETE" in stack.stack_status:
logger.info(f"Cloudformation operation {mode} succeeded!")
return True
else:
logger.error(f"Unexpected cloudformation state {stack.stack_status}")
raise Exception(f"Unexpected cloudformation state when awaiting completion: {stack.stack_status}")
def get_satcomp_bucket(account_number, region, project_name) -> str:
# needs to be synched with CloudFormation
return str(account_number) + '-' + region + '-' + project_name
def delete_resources(session, s3, project_name, bucket) -> None:
# delete s3 bucket
logger.info(f"Deleting S3 bucket {bucket}")
s3_client = session.client('s3')
try:
s3.delete_bucket(bucket)
except s3_client.exceptions.NoSuchBucket as e:
logger.info(f"No bucket {bucket} exists; perhaps it was already deleted.")
# delete ecr instances
ecr_client = session.client('ecr')
ecr = ECR(ecr_client)
logger.info(f"Deleting all images within ECR repository")
ecr.delete_project_repositories(project_name)
def arg_parser() -> dict:
parser = argparse.ArgumentParser()
parser.add_argument('--profile', required = False, help = "AWS profile (uses 'default' if not provided)")
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
parser.add_argument('--solver-type', required = True, choices = ('cloud', 'parallel'), help = "Either 'cloud' or 'parallel' depending on the desired configuration.")
parser.add_argument('--mode', required = False, default = 'create', choices = ('create', 'update', 'delete'), help = "One of 'create', 'update', 'delete': create, update or delete the infrastructure.")
# MWW: can add these back in for advanced users, but hiding for now.
# parser.add_argument('--dryrun', type=bool, required = False, help = "Dry run ONLY. Do not actually create resources.")
# parser.add_argument('--instance', required = False, help = "EXPERTS ONLY: Override default machine instance type")
# parser.add_argument('--ami', required = False, help = "EXPERTS ONLY: Override default instance AMI")
# parser.add_argument('--memory', type=int, required = True, help = "EXPERTS ONLY: Override default max memory for container (61000 for cloud, 253000 for parallel)")
args = parser.parse_args()
# argument checking.
# matcher = re.compile(allowable_project_names_mask)
if not re.match(r"^[a-z]([a-z\d\/_-])*$", args.project):
logger.error(f"Invalid project name '{args.project}'.")
logger.error(f"Project names must start with a lowercase letter and can consist only of lowercase numbers, digits, '/', '_', or '-' characters.")
sys.exit(-1)
return args
def main() -> None:
args = arg_parser()
if args.solver_type == "cloud":
instance_type = 'm6i.4xlarge'
memory = '61000'
elif args.solver_type == "parallel":
instance_type = 'm6i.16xlarge'
memory = '253000'
else:
raise Exception('Solver type argument must be one of "cloud" or "parallel"')
profile = args.profile
#profile = "default"
project_name = args.project
stack_name = f"solver-infrastructure-{project_name}"
#RBJ#
try:
if profile:
session = boto3.Session(profile_name=profile)
else:
session = boto3.Session()
except ProfileNotFound as e:
logger.error(f"Unable to create AWS session. Please check that default profile is set up in the ~/.aws/config file and has appropriate rights (or if --profile was provided, that this profile has appropriate rights)")
sys.exit(1)
if not session.region_name:
logger.error(f"Profile does not have a region defined. Please add a region (recommend: us-east-1) to profile '{profile}' in the ~/.aws/config file")
sys.exit(1)
try:
# link to AWS services
ssm_client = session.client('ssm')
ssm = SSM(ssm_client)
sts_client = session.client('sts')
sts = STS(sts_client)
s3 = session.resource('s3')
s3_file_system = S3Filesystem(s3)
cloudformation = session.resource('cloudformation')
cfn = CloudFormation(cloudformation, stack_name)
# set up parameters
account_number = sts.get_account_number()
region = session.region_name
ami_id = ssm.get_ami_image()
satcomp_bucket = get_satcomp_bucket(account_number, region, project_name)
# logger.info("Exiting early so as not to actually do anything")
# return
if args.mode == "update":
cfn.update_cloudformation_stack(project_name, instance_type, ami_id, memory)
cfn.await_completion("update")
elif args.mode == "delete":
delete_resources(session, s3_file_system, project_name, satcomp_bucket)
cfn.delete_cloudformation_stack()
cfn.await_completion("delete")
elif args.mode == "create":
cfn.create_cloudformation_stack(project_name, instance_type, ami_id, memory)
ok = cfn.await_completion("create")
if ok:
logger.info(f"Uploading test.cnf file to the {satcomp_bucket} bucket")
s3_file_system.upload_file_to_s3(satcomp_bucket, "test.cnf")
else:
logger.error(f"Unexpected operation {args.mode}")
raise Exception("Internal error: unexpected operation {args.mode}")
except ClientError as ce:
logger.error("An error occurred during an AWS operation. Usually this is caused by the AWS session having insufficient rights to resources. Please double check that the default profile is properly set up in your AWS Config.")
logger.error("Error description: {ce}")
raise ce
if __name__ == "__main__":
main()

View File

@ -0,0 +1,81 @@
#!/usr/bin/env python3
import argparse
import logging
import boto3
import json
import subprocess
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Quickstart Build")
logger.setLevel(logging.INFO)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--profile', required = False, help = "AWS profile")
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
args = parser.parse_args()
profile_args = ['--profile', args.profile] if args.profile else []
project_args = ['--project', args.project]
# run the create script
cmd = ['python3', 'manage-solver-infrastructure.py', \
'--solver-type', 'cloud', \
'--mode', 'create'] \
+ profile_args + project_args
logger.info(f"Creating the AWS infrastructure for project {args.project} using create-solver-project.")
logger.info("This operation will likely take 5-7 minutes.")
try:
result = subprocess.run(cmd)
pass
except Exception as e:
logger.error(f"Unexpected error {e}: Unable to run {cmd}. Is python3 installed and in your path?")
exit(-1)
if result.returncode:
logger.error(f"create-solver-infrastructure failed with error code {result}")
logger.error("Unable to create solver infrastructure...has it already been created?")
logger.error("If not, have you set up the default profile in your ~/.aws/config file?")
logger.error("If so, does your project name meet the expected format?")
exit(-1)
logger.info("Building the Docker images for the competition base containers")
cmd = ['cd ../docker/satcomp-images && ./build_satcomp_images.sh && cd ../../infrastructure']
try:
result = subprocess.run(cmd, shell=True)
except Exception as e:
logger.error(f"Unexpected error {e}: unable to run command to build base image: {cmd}.")
exit(-1)
if result.returncode:
logger.error("Unexpected error: Unable to build docker images for base containers.")
exit(-1)
logger.info("Building the Docker images for Mallob. This will require ~10 minutes.")
cmd = 'cd ../docker/mallob-images && ./build_mallob_images.sh && cd ../../infrastructure'
try:
result = subprocess.run(cmd, shell=True)
except Exception as e:
logger.error(f"Unexpected error {e}: unable to run command to build mallob image: {cmd}.")
exit(-1)
if result.returncode:
logger.error("Unexpected error: Unable to build docker images for base containers.")
exit(-1)
cmd = ['python3', 'docker-upload-to-ecr.py', \
'--leader', 'satcomp-mallob:leader', \
'--worker', 'satcomp-mallob:worker'] \
+ profile_args + project_args
logger.info("Uploading the Mallob Docker files to ECR.")
logger.info("This operation will likely take 2-5 minutes, depending on network bandwidth.")
try:
result = subprocess.run(cmd)
except Exception as e:
logger.error(f"Unexpected error {e}: Unable to upload Mallob docker images to ECR.")
exit(-1)
if result.returncode:
logger.error(f"Unexpected error (return code {result}). Unable to upload Mallob docker images to ECR")
exit(-1)
logger.info("Upload complete; quickstart-build successful!")
exit(0)

View File

@ -0,0 +1,114 @@
#!/usr/bin/env python3
import argparse
import logging
import boto3
import json
import subprocess
import sys
from botocore.exceptions import ProfileNotFound
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Quickstart Run")
logger.setLevel(logging.INFO)
WORKER_NODES = 3
class STS:
def __init__(self, sts_client) -> None:
self.sts = sts_client
def get_account_number(self) -> str:
try:
return self.sts.get_caller_identity()["Account"]
except Exception as e:
logger.error(f"Failed to get profile account number: {e}")
raise e
def argument_parser():
parser = argparse.ArgumentParser()
parser.add_argument('--profile', required = False, help = "AWS profile")
parser.add_argument('--project', required = False, default = 'comp23', help = "ADVANCED USERS ONLY: Name of the project (default: 'comp23').")
parser.add_argument('--keep-alive', required = False, help = "If True, then cluster is not destroyed after solving.")
parser.add_argument('--s3-locations', nargs='+', required = False, help = "S3 URLs for problem location of the form: s3://BUCKET_NAME/OBJECT_LOCATION. Default is to run test.cnf from the default bucket. You can specify multiple URLs (space separated)")
return parser
if __name__ == "__main__":
parser = argument_parser()
args = parser.parse_args()
try:
if args.profile:
session = boto3.Session(profile_name=args.profile)
else:
session = boto3.Session()
except ProfileNotFound as e:
logger.error(f"Unable to create AWS session. Please check that default profile is set up in the ~/.aws/config file and has appropriate rights (or if --profile was provided, that this profile has appropriate rights)")
sys.exit(1)
sts = STS(session.client('sts'))
account_number = sts.get_account_number()
# s3_bucket is also referenced after solving, so compute it unconditionally
s3_bucket = f"s3://{account_number}-us-east-1-{args.project}"
if not args.s3_locations:
    problem_locations = [f"{s3_bucket}/test.cnf"]
else:
    problem_locations = args.s3_locations
profile_args = ['--profile', args.profile] if args.profile else []
# run the ecs-config script
cmd = ['python3', 'ecs-config',
'--workers', str(WORKER_NODES), \
'setup'] \
+ profile_args
logger.info(f"Setting up ECS cluster with {WORKER_NODES} worker nodes using ecs-config.")
logger.info(f"command that is being executed is: {' '.join(cmd)}")
logger.info("This operation will likely take 5-7 minutes.")
logger.info("***Note that while configuration is in process, the system will report Autoscaling failures. This is normal, but should not persist for more than 10 minutes.***")
try:
result = subprocess.run(cmd)
except Exception as e:
logger.error(f"Unexpected error {e}: Unable to run {cmd}. ")
exit(-1)
if result.returncode:
logger.error(f"ecs-config failed with error code {result}")
logger.error("Have you set up the default profile in your ~/.aws/config file?")
exit(-1)
for problem_location in problem_locations:
cmd = ['python3', 'send_message', '--location', problem_location, '--workers', str(WORKER_NODES), '--await-response', 'True'] + profile_args
logger.info("Sending a message to the cluster to run the `temp.cnf' problem.")
logger.info("It is stored in the S3 location: {problem_location}")
logger.info("Please go inspect the CloudWatch logs for project {args.project} to see it running, as described in the Infrastructure README.")
logger.info(f"command that is being executed is: {' '.join(cmd)}")
try:
result = subprocess.run(cmd)
logger.info(f"Send message completed. Intermediate files and stdout/stderr are available in {s3_bucket}/tmp")
except Exception as e:
    logger.error(f"Unexpected error {e}: unable to run command to send message.")
    continue
if result.returncode:
    logger.error("Unexpected error: Command to send message to queue failed.")
if args.keep_alive:
logger.info("Keep-alive option selected; not deleting cluster")
else:
logger.info("Deleting the cluster. This will require a minute or two.")
cmd = ['python3', 'ecs-config', 'shutdown'] + profile_args
logger.info("Tearing down the cluster.")
logger.info(f"command that is being executed is: {' '.join(cmd)}")
try:
result = subprocess.run(cmd)
except Exception as e:
logger.error(f"Unexpected error {e}: unable to run command: {cmd}.")
exit(-1)
if result.returncode:
logger.error("Unexpected error returned by cluster teardown. PLEASE MANUALLY CHECK WHETHER CLUSTER WAS DELETED.")
exit(-1)
logger.info("Run complete; quickstart-run successful!")
exit(0)

(Nine binary image files added in this commit; contents not shown. Sizes: 137, 83, 214, 67, 62, 98, 34, 48, and 178 KiB.)

View File

@ -0,0 +1,172 @@
#!/usr/bin/env python3
import argparse
import logging
import boto3
import json
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Runner")
logger.setLevel(logging.INFO)
QUEUE_WAIT_TIME = 10
class SqsService:
def __init__(self, client):
self.sqs = client
def get_satcomp_queue(self):
"""Get SQS services and return sat comp queue.
Returns: SQS SatCompQueue
"""
try:
response = self.sqs.list_queues()
for service in response['QueueUrls']:
if service.endswith('SatCompQueue'):
return service
raise "No queue ending with 'SatCompQueue' "
except Exception as e:
logger.error(f"Failed to get SQS queue: {e}")
raise e
def get_satcomp_output_queue(self):
"""Get SQS services and return sat comp queue.
Returns: SQS SatCompQueue
"""
try:
response = self.sqs.list_queues()
for service in response['QueueUrls']:
if service.endswith('SatCompOutputQueue'):
return service
raise "No queue ending with 'SatCompOutputQueue' "
except Exception as e:
logger.error(f"Failed to get SQS queue: {e}")
raise e
def send_message(self, location, workers, timeout, solverName, language, solverOptions):
# Expected message structure:
"""{
"formula" : {
"value" : <s3 url>,
"language": "SMTLIB2" | "DIMACS",
},
"solverConfig" : {
"solverName" : "",
"solverOptions" : [],
"taskTimeoutSeconds" : 5
},
"num_workers": 0
}"""
queue = self.get_satcomp_queue()
message_body = { \
"formula": { \
"value": location, \
"language": language \
}, \
"solverConfig" : { \
"solverName" : solverName, \
"solverOptions" : solverOptions, \
"taskTimeoutSeconds" : timeout, \
}, \
"num_workers": workers \
}
message_body_str = json.dumps(message_body, indent = 4)
try:
response = self.sqs.send_message(
QueueUrl = queue,
MessageBody = message_body_str
)
except Exception as e:
logger.error(f"Failed to send message: Exception: {e}")
raise e
#
#
# {
# 'Messages': [
# {
# 'MessageId': 'string',
# 'ReceiptHandle': 'string',
# 'MD5OfBody': 'string',
# 'Body': 'string',
# 'Attributes': {
# 'string': 'string'
# },
# 'MD5OfMessageAttributes': 'string',
# 'MessageAttributes': {
# 'string': {
# 'StringValue': 'string',
# 'BinaryValue': b'bytes',
# 'StringListValues': [
# 'string',
# ],
# 'BinaryListValues': [
# b'bytes',
# ],
# 'DataType': 'string'
# }
# }
# },
# ]
# }
def receive_and_delete_message(self, timeout):
queue = self.get_satcomp_output_queue()
logger.info(f"Receiving and deleting message from queue {queue}")
total_time = 0
while (total_time < timeout + 5):
logger.info(f"Waiting up to 10s for a message.")
response = self.sqs.receive_message(
QueueUrl = queue,
WaitTimeSeconds = QUEUE_WAIT_TIME
)
if response["Messages"]:
for msg in response["Messages"]:
body = msg["Body"]
logger.info(f"Response from receive_message was: {body}")
self.sqs.delete_message(
QueueUrl = queue,
ReceiptHandle = msg["ReceiptHandle"]
)
return response
total_time = total_time + QUEUE_WAIT_TIME
logger.error("Solver did not complete within expected timeout.")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--profile', required = False, help = "AWS profile")
parser.add_argument('--location', required = True, help = "S3 location for CNF file")
parser.add_argument('--workers', required = True, type=int, help = "Required Worker nodes count")
parser.add_argument('--timeout', type=int, help = "Timeout value for the infrastructure to interrupt the solver", default = 60)
parser.add_argument('--name', help = "Name of solver to be invoked (passed through to the solver). Default: empty string", default = "")
parser.add_argument('--format', help = "Problem language for the problem to be solved ('DIMACS' or 'SMTLIB2'). Default: empty string", default = "")
parser.add_argument('--args', nargs='+', help="Arguments to pass through to the solver (--args accepts a space-delimited list of arguments). Default: empty list", default = [])
parser.add_argument('--await-response', required = False, help = "If true, then solver will poll output queue for response message, display, and delete it.")
args = parser.parse_args()
profile = args.profile
# Send message
session = boto3.Session(profile_name=profile)
sqs_client = session.client('sqs')
sqs = SqsService(sqs_client)
try:
sqs.send_message(args.location, args.workers, args.timeout, args.name, args.format, args.args)
if args.await_response:
sqs.receive_and_delete_message(args.timeout)
except Exception as e:
logger.info(f"Failed to send message. {e}")

View File

@ -0,0 +1,78 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
AWSTemplateFormatVersion: 2010-09-09
Parameters:
emailAddress:
Type: String
Description: "Basic Budget"
Resources:
BudgetExample:
Type: "AWS::Budgets::Budget"
Properties:
Budget:
BudgetLimit:
Amount: 1000
Unit: USD
TimeUnit: MONTHLY
BudgetType: COST
NotificationsWithSubscribers:
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 99
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 95
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 90
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 80
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 70
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 50
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 30
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
- Notification:
NotificationType: ACTUAL
ComparisonOperator: GREATER_THAN
Threshold: 10
Subscribers:
- SubscriptionType: EMAIL
Address: !Ref emailAddress
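A budget stack based on this template can be created with boto3; the following is a minimal sketch (the template filename, stack name, and email address are placeholders, not values defined in this repository):

# Sketch only: 'basic-budget.yaml' and the stack name are illustrative placeholders.
import boto3

def deploy_budget(email: str, template_path: str = "basic-budget.yaml") -> None:
    cfn = boto3.client("cloudformation")
    with open(template_path) as f:
        body = f.read()
    cfn.create_stack(
        StackName="satcomp-budget",
        TemplateBody=body,
        Parameters=[{"ParameterKey": "emailAddress", "ParameterValue": email}],
    )

# Example: deploy_budget("you@example.com")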

View File

@ -0,0 +1,686 @@
# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
---
AWSTemplateFormatVersion: '2010-09-09'
Description: 'AWS CloudFormation Sample Template Managed Single ECS Job Queue: This
template demonstrates the usage of simple Job Queue and EC2 style Compute Environment
along with multi-node jobs (on a relatively small scale: 4 instances, 16 cores each).
N.B.: This is all boilerplate until the EcsInstanceRole! '
Parameters:
ProjectName:
Type: String
Default: "proofs"
Description: "S3 bucket will be AccountId-Region-ProjectName"
AvailZoneId:
Type: String
Default: "a"
Description: "Availability Zone ID"
InstanceType:
Type: String
Default: "m6i.4xlarge"
Description: "Instance type"
ContainerMemory:
Type: String
Description: "Memory Size for containers (61000 for cloud, 253000 for parallel)"
AmiImageId:
Type: String
Description: "Ami for your instances"
Resources:
VPC:
Type: AWS::EC2::VPC
Properties:
CidrBlock: 10.0.0.0/16
EnableDnsHostnames: true
EnableDnsSupport: true
SecondCidr:
Type: AWS::EC2::VPCCidrBlock
Properties:
CidrBlock: 10.1.0.0/16
VpcId: !Ref VPC
InternetGateway:
Type: AWS::EC2::InternetGateway
EIP:
Type: 'AWS::EC2::EIP'
Properties:
Domain: vpc
VPCGatewayAttachment:
Type: AWS::EC2::VPCGatewayAttachment
Properties:
VpcId:
Ref: VPC
InternetGatewayId:
Ref: InternetGateway
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: EC2 Security Group for instances launched in the VPC by Batch
VpcId:
Ref: VPC
SecurityGroupIngress:
- CidrIp: 10.0.0.0/0
Description: All TCP ports (includes SSH)
FromPort: 0
IpProtocol: TCP
ToPort: 65535
NatGateway:
Type: 'AWS::EC2::NatGateway'
Properties:
AllocationId: !GetAtt 'EIP.AllocationId'
SubnetId: !Ref SubnetPublic
NatAttachment:
Type: AWS::EC2::VPCGatewayAttachment
Properties:
InternetGatewayId: !Ref InternetGateway
VpcId: !Ref VPC
SubnetPrivate:
Type: AWS::EC2::Subnet
DependsOn: SecondCidr
Properties:
AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
CidrBlock: 10.1.0.0/16
VpcId: !Ref VPC
RouteTablePrivate:
Type: AWS::EC2::RouteTable
Properties:
VpcId:
Ref: VPC
RoutePrivate:
Type: AWS::EC2::Route
Properties:
RouteTableId:
Ref: RouteTablePrivate
DestinationCidrBlock: 0.0.0.0/0
NatGatewayId: !Ref NatGateway
SubnetRouteTableAssociationPrivate:
Type: AWS::EC2::SubnetRouteTableAssociation
Properties:
RouteTableId: !Ref RouteTablePrivate
SubnetId: !Ref SubnetPrivate
SubnetPublic:
Type: AWS::EC2::Subnet
Properties:
AvailabilityZone: !Sub "${AWS::Region}${AvailZoneId}"
CidrBlock: 10.0.1.0/24
VpcId: !Ref VPC
MapPublicIpOnLaunch: 'True'
RouteTablePublic:
Type: AWS::EC2::RouteTable
Properties:
VpcId:
Ref: VPC
RoutePublic:
Type: AWS::EC2::Route
Properties:
RouteTableId:
Ref: RouteTablePublic
DestinationCidrBlock: 0.0.0.0/0
GatewayId: !Ref InternetGateway
SubnetRouteTableAssociationPublic:
Type: AWS::EC2::SubnetRouteTableAssociation
Properties:
RouteTableId: !Ref RouteTablePublic
SubnetId: !Ref SubnetPublic
EcsCluster:
Type: AWS::ECS::Cluster
Properties:
ClusterName: SatCompCluster
ClusterSettings:
- Name: containerInsights
Value: enabled
Ec2AutoscaleInstanceProfile:
Type: "AWS::IAM::InstanceProfile"
Properties:
Path: "/"
Roles:
- Ref: "InstanceRole"
InstanceRole:
Type: AWS::IAM::Role
Properties:
ManagedPolicyArns:
- arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
AssumeRolePolicyDocument:
Statement:
- Effect: Allow
Principal:
Service: [ ec2.amazonaws.com ]
Action: [ 'sts:AssumeRole' ]
Path: /
Policies:
- PolicyName: ecs-service
PolicyDocument:
Statement:
- Effect: Allow
Action:
# Rules which allow ECS to attach network interfaces to instances
# on your behalf in order for awsvpc networking mode to work right
- 'ec2:AttachNetworkInterface'
- 'ec2:CreateNetworkInterface'
- 'ec2:CreateNetworkInterfacePermission'
- 'ec2:DeleteNetworkInterface'
- 'ec2:DeleteNetworkInterfacePermission'
- 'ec2:Describe*'
- 'ec2:DetachNetworkInterface'
- 'elasticfilesystem:*'
- 'cloudwatch:*'
- 'ecs:*'
# Rules which allow ECS to update load balancers on your behalf
# with the information about how to send traffic to your containers
- 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
- 'elasticloadbalancing:DeregisterTargets'
- 'elasticloadbalancing:Describe*'
- 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
- 'elasticloadbalancing:RegisterTargets'
- 's3:GetObject'
- 's3:GetObjectVersion'
Resource: '*'
EcsInstanceLc:
Type: AWS::AutoScaling::LaunchConfiguration
Properties:
ImageId: !Ref AmiImageId
InstanceType: !Select [ 0, [ !Ref InstanceType ] ]
AssociatePublicIpAddress: true
IamInstanceProfile: !GetAtt Ec2AutoscaleInstanceProfile.Arn
SecurityGroups: [ !Ref SecurityGroup ]
BlockDeviceMappings:
- DeviceName: /dev/xvda
Ebs:
VolumeSize: 30
VolumeType: gp2
- DeviceName: /dev/xvdcz
Ebs:
VolumeSize: 22
VolumeType: gp2
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
echo ECS_CLUSTER=${EcsCluster} >> /etc/ecs/ecs.config
yum install -y aws-cfn-bootstrap python-pip
pip install awscli boto3
/opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSAutoScalingGroup --region ${AWS::Region}
EcsInstanceAsg:
Type: AWS::AutoScaling::AutoScalingGroup
UpdatePolicy:
AutoScalingRollingUpdate:
MaxBatchSize: 10
MinSuccessfulInstancesPercent: 95
PauseTime: PT30M
SuspendProcesses: [ HealthCheck, ReplaceUnhealthy, AZRebalance, AlarmNotification,
ScheduledActions ]
WaitOnResourceSignals: 'true'
Properties:
VPCZoneIdentifier: [ !Ref SubnetPrivate ]
LaunchConfigurationName: !Ref EcsInstanceLc
MinSize: '0'
MaxSize: '0'
DesiredCapacity: '0'
Tags:
- Key: IsAutoscaledCluster
PropagateAtLaunch: true
Value: "true"
- Key: "Patch Group"
PropagateAtLaunch: true
Value: "ManagedClusterPatchGroup"
AutoscaleCapacityProvider:
Type: AWS::ECS::CapacityProvider
Properties:
AutoScalingGroupProvider:
AutoScalingGroupArn: !Ref EcsInstanceAsg
ManagedTerminationProtection: DISABLED
ManagedScaling:
Status: DISABLED
Name: AutoscaleCapacityProvider
# A security group for the containers we will run in Fargate.
# Rules are added to this security group based on what ingress you
# add for the cluster.
ContainerSecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: Access to the Fargate containers
VpcId: !Ref 'VPC'
# A role used to allow AWS Autoscaling to inspect stats and adjust scaleable targets
# on your AWS account
AutoscalingRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Statement:
- Effect: Allow
Principal:
Service: [ application-autoscaling.amazonaws.com ]
Action: [ 'sts:AssumeRole' ]
Path: /
Policies:
- PolicyName: service-autoscaling
PolicyDocument:
Statement:
- Effect: Allow
Action:
- 'application-autoscaling:*'
- 'cloudwatch:DescribeAlarms'
- 'cloudwatch:PutMetricAlarm'
- 'ecs:DescribeServices'
- 'ecs:UpdateService'
Resource: '*'
# This is an IAM role which authorizes ECS to manage resources on your
# account on your behalf, such as updating your load balancer with the
# details of where your containers are, so that traffic can reach your
# containers.
ECSRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Statement:
- Effect: Allow
Principal:
Service: [ ecs.amazonaws.com ]
Action: [ 'sts:AssumeRole' ]
Path: /
Policies:
- PolicyName: ecs-service
PolicyDocument:
Statement:
- Effect: Allow
Action:
# Rules which allow ECS to attach network interfaces to instances
# on your behalf in order for awsvpc networking mode to work right
- 'ec2:AttachNetworkInterface'
- 'ec2:CreateNetworkInterface'
- 'ec2:CreateNetworkInterfacePermission'
- 'ec2:DeleteNetworkInterface'
- 'ec2:DeleteNetworkInterfacePermission'
- 'ec2:Describe*'
- 'ec2:DetachNetworkInterface'
- 'elasticfilesystem:*'
# Rules which allow ECS to update load balancers on your behalf
# with the information about how to send traffic to your containers
- 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
- 'elasticloadbalancing:DeregisterTargets'
- 'elasticloadbalancing:Describe*'
- 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
- 'elasticloadbalancing:RegisterTargets'
Resource: '*'
SsmVpcEndpoint:
Type: AWS::EC2::VPCEndpoint
Properties:
VpcEndpointType: Interface
ServiceName: !Sub 'com.amazonaws.${AWS::Region}.ssm'
VpcId: !Ref VPC
SecurityGroupIds: [ !Ref SecurityGroup ]
SubnetIds: [ !Ref SubnetPrivate ]
PrivateDnsEnabled: true
EcsTaskRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Principal:
Service: ecs-tasks.amazonaws.com
Action: sts:AssumeRole
ManagedPolicyArns:
- arn:aws:iam::aws:policy/AmazonEC2FullAccess
- arn:aws:iam::aws:policy/AmazonS3FullAccess
Policies:
- PolicyName: !Sub "project-metrics-${AWS::Region}-${ProjectName}"
PolicyDocument:
Version: 2012-10-17
Statement:
- Action:
- cloudwatch:PutMetricData
Effect: Allow
Resource: "*"
- PolicyName: SatCompQueueExecutionRolePolicy
PolicyDocument:
Version: 2012-10-17
Statement:
- Action:
- "sqs:GetQueueAttributes"
- "sqs:SendMessage"
- "sqs:ReceiveMessage"
- "sqs:DeleteMessage"
- "sqs:DeleteMessageBatch"
- "sqs:GetQueueUrl"
Effect: Allow
Resource: "*"
- PolicyName: SatCompDynamoExecutionRolePolicy
PolicyDocument:
Version: 2012-10-17
Statement:
- Action:
- "dynamodb:UpdateItem"
- "dynamodb:Scan"
Effect: Allow
Resource: "*"
SolverLeaderTaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
TaskRoleArn: !GetAtt EcsTaskRole.Arn
NetworkMode: awsvpc
RequiresCompatibilities:
- EC2
Volumes:
- Name: SatVolume
EFSVolumeConfiguration:
FilesystemId: !Ref FileSystem
TransitEncryption: ENABLED
AuthorizationConfig:
AccessPointId: !Ref AccessPoint
ContainerDefinitions:
- Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:leader"
Name: !Sub "${ProjectName}-leader"
MemoryReservation: 61000
Environment:
- Name: SQS_QUEUE_NAME
Value: !GetAtt SatCompQueue.QueueName
- Name: SQS_OUTPUT_QUEUE_NAME
Value: !GetAtt SatCompOutputQueue.QueueName
- Name: AWS_DEFAULT_REGION
Value: !Ref AWS::Region
- Name: SATCOMP_BUCKET_NAME
Value: !Ref SatCompBucket
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub "/ecs/${ProjectName}-leader"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
MountPoints:
- ContainerPath: '/mount/efs'
SourceVolume: SatVolume
PortMappings:
- HostPort: 22
ContainerPort: 22
Protocol: TCP
SolverLogGroupLeader:
Type: AWS::Logs::LogGroup
Properties:
LogGroupName: !Sub "/ecs/${ProjectName}-leader"
RetentionInDays: 1827
SolverWorkerTaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
ExecutionRoleArn: !GetAtt ECSTaskExecutionRole.Arn
TaskRoleArn: !GetAtt EcsTaskRole.Arn
NetworkMode: awsvpc
RequiresCompatibilities:
- EC2
Volumes:
- Name: SatVolume
EFSVolumeConfiguration:
FilesystemId: !Ref FileSystem
TransitEncryption: ENABLED
AuthorizationConfig:
AccessPointId: !Ref AccessPoint
ContainerDefinitions:
- Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${ProjectName}:worker"
Name: !Sub "${ProjectName}-worker"
MemoryReservation: 61000
Environment:
- Name: SQS_QUEUE_NAME
Value: !GetAtt SatCompQueue.QueueName
- Name: SQS_OUTPUT_QUEUE_NAME
Value: !GetAtt SatCompOutputQueue.QueueName
- Name: AWS_DEFAULT_REGION
Value: !Ref AWS::Region
- Name: SATCOMP_BUCKET_NAME
Value: !Ref SatCompBucket
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub "/ecs/${ProjectName}-worker"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
MountPoints:
- ContainerPath: '/mount/efs'
SourceVolume: SatVolume
PortMappings:
- HostPort: 22
ContainerPort: 22
Protocol: TCP
SolverLogGroupWorker:
Type: AWS::Logs::LogGroup
Properties:
LogGroupName: !Sub "/ecs/${ProjectName}-worker"
RetentionInDays: 1827
ECSTaskExecutionRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Statement:
- Effect: Allow
Principal:
Service: [ ecs-tasks.amazonaws.com ]
Action: [ 'sts:AssumeRole' ]
Path: /
Policies:
- PolicyName: AmazonECSTaskExecutionRolePolicy
PolicyDocument:
Statement:
- Effect: Allow
Action:
# Allow the ECS Tasks to download images from ECR
- 'ecr:GetAuthorizationToken'
- 'ecr:BatchCheckLayerAvailability'
- 'ecr:GetDownloadUrlForLayer'
- 'ecr:BatchGetImage'
# Allow the ECS tasks to upload logs to CloudWatch
- 'logs:CreateLogStream'
- 'logs:PutLogEvents'
Resource: '*'
SolverLeaderService:
Type: AWS::ECS::Service
Properties:
Cluster:
Ref: EcsCluster
DesiredCount: 0
LaunchType: EC2
NetworkConfiguration:
AwsvpcConfiguration:
SecurityGroups:
- !Ref SecurityGroup
Subnets:
- !Ref SubnetPrivate
TaskDefinition:
Ref: SolverLeaderTaskDefinition
SolverWorkerService:
Type: AWS::ECS::Service
Properties:
Cluster:
Ref: EcsCluster
DesiredCount: 0
LaunchType: EC2
NetworkConfiguration:
AwsvpcConfiguration:
SecurityGroups:
- !Ref SecurityGroup
Subnets:
- !Ref SubnetPrivate
TaskDefinition:
Ref: SolverWorkerTaskDefinition
AccessPoint:
Type: AWS::EFS::AccessPoint
Properties:
FileSystemId: !Ref FileSystem
PosixUser:
Uid: "1001"
Gid: "1001"
RootDirectory:
CreationInfo:
OwnerGid: "1001"
OwnerUid: "1001"
Permissions: "0755"
Path: "/sat-comp"
FileSystem:
Type: AWS::EFS::FileSystem
Properties:
PerformanceMode: generalPurpose
EFSMountTarget:
Type: AWS::EFS::MountTarget
Properties:
FileSystemId: !Ref FileSystem
SecurityGroups:
- !Ref SecurityGroup
SubnetId: !Ref SubnetPrivate
SatCompQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub "${AWS::AccountId}-${AWS::Region}-SatCompQueue"
VisibilityTimeout: 5
ReceiveMessageWaitTimeSeconds: 20
KmsDataKeyReusePeriodSeconds: 3600
KmsMasterKeyId: "alias/aws/sqs"
SatCompOutputQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub "${AWS::AccountId}-${AWS::Region}-SatCompOutputQueue"
VisibilityTimeout: 5
ReceiveMessageWaitTimeSeconds: 20
KmsDataKeyReusePeriodSeconds: 3600
KmsMasterKeyId: "alias/aws/sqs"
NodeManifest:
Type: AWS::DynamoDB::Table
Properties:
AttributeDefinitions:
-
AttributeName: nodeId
AttributeType: S
BillingMode: PAY_PER_REQUEST
KeySchema:
-
AttributeName: nodeId
KeyType: HASH
TableName: SatCompNodeManifest
TaskEndNotification:
Type: AWS::DynamoDB::Table
Properties:
AttributeDefinitions:
-
AttributeName: leaderIp
AttributeType: S
BillingMode: PAY_PER_REQUEST
KeySchema:
-
AttributeName: leaderIp
KeyType: HASH
TableName: TaskEndNotification
SolverLeaderEcr:
Type: AWS::ECR::Repository
Properties:
RepositoryName: !Sub "${ProjectName}"
LifecyclePolicy:
LifecyclePolicyText: '
{
"rules": [ {
"rulePriority": 10,
"description": "remove untagged images except the latest one",
"selection": {
"tagStatus": "untagged",
"countType": "imageCountMoreThan",
"countNumber": 1
},
"action": {
"type": "expire"
}
} ]
}'
# SolverWorkerEcr:
# Type: AWS::ECR::Repository
# Properties:
# RepositoryName: !Sub "${ProjectName}-worker"
# LifecyclePolicy:
# LifecyclePolicyText: '
# {
# "rules": [ {
# "rulePriority": 10,
# "description": "remove untagged images except the latest one",
# "selection": {
# "tagStatus": "untagged",
# "countType": "imageCountMoreThan",
# "countNumber": 1
# },
# "action": {
# "type": "expire"
# }
# } ]
# }'
SatCompBucket:
Type: AWS::S3::Bucket
Properties:
BucketName: !Sub "${AWS::AccountId}-${AWS::Region}-${ProjectName}"
BucketEncryption:
ServerSideEncryptionConfiguration:
- ServerSideEncryptionByDefault:
SSEAlgorithm: AES256
PublicAccessBlockConfiguration:
BlockPublicAcls: true
BlockPublicPolicy: true
IgnorePublicAcls: true
RestrictPublicBuckets: true
Outputs:
Subnet:
Value:
Ref: SubnetPrivate
Export:
Name: SubnetId
SecurityGroupId:
Value:
Ref: SecurityGroup
Export:
Name: SecurityGroup
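The leader and worker task definitions above inject the queue and bucket names into the containers through environment variables (SQS_QUEUE_NAME, SQS_OUTPUT_QUEUE_NAME, SATCOMP_BUCKET_NAME, AWS_DEFAULT_REGION). The sketch below shows how a container entry point could read them to long-poll the input queue; it is illustrative only and not the actual base-container implementation:

# Sketch only: reads the task-definition environment variables set above;
# the real leader/worker containers are defined elsewhere in this repository.
import os
import boto3

def poll_input_queue():
    sqs = boto3.client("sqs", region_name=os.environ["AWS_DEFAULT_REGION"])
    queue_url = sqs.get_queue_url(QueueName=os.environ["SQS_QUEUE_NAME"])["QueueUrl"]
    # Long-poll for a solve request; the body follows the schema used by send_message.
    response = sqs.receive_message(QueueUrl=queue_url, WaitTimeSeconds=20)
    return response.get("Messages", [])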

View File

@ -0,0 +1,4 @@
c
p cnf 3 2
1 -3 0
2 3 -1 0

View File

@ -0,0 +1,2 @@
#!/bin/sh
python3 ./manage-solver-infrastructure.py --mode update "$@"