10 #ifndef WRENCH_BAREMETALCOMPUTESERVICE_H
11 #define WRENCH_BAREMETALCOMPUTESERVICE_H
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/compute/standard_job_executor/StandardJobExecutor.h"
18 #include "BareMetalComputeServiceProperty.h"
19 #include "BareMetalComputeServiceMessagePayload.h"
20 #include "wrench/services/compute/workunit_executor/Workunit.h"
21 #include "wrench/services/helpers/HostStateChangeDetector.h"
55 std::map<std::string, std::string> default_property_values = {
62 std::map<std::string, double> default_messagepayload_values = {
88 const std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
89 std::string scratch_space_mount_point,
90 std::map<std::string, std::string> property_list = {},
91 std::map<std::string, double> messagepayload_list = {}
96 const std::vector<std::string> compute_hosts,
97 std::string scratch_space_mount_point,
98 std::map<std::string, std::string> property_list = {},
99 std::map<std::string, double> messagepayload_list = {}
/**
 * Override of an inherited virtual for submitting a StandardJob to this service.
 * (NOTE(review): the enclosing class header is not visible in this listing;
 * the base is presumably ComputeService, whose header is included above.)
 *
 * @param job the standard job being submitted
 * @param service_specific_args string-to-string arguments for this submission;
 *        taken by non-const reference (whether they are modified is not
 *        visible from the declaration)
 */
107 void submitStandardJob(
StandardJob *job, std::map<std::string, std::string> &service_specific_args)
override;
/**
 * Override of an inherited virtual for submitting a PilotJob to this service.
 * Same parameter shape as submitStandardJob, with a PilotJob instead.
 *
 * @param job the pilot job being submitted
 * @param service_specific_args string-to-string arguments for this submission;
 *        taken by non-const reference
 */
109 void submitPilotJob(
PilotJob *job, std::map<std::string, std::string> &service_specific_args)
override;
/**
 * Override of an inherited virtual for terminating a StandardJob.
 *
 * @param job the standard job to terminate
 */
111 void terminateStandardJob(
StandardJob *job)
override;
/**
 * Override of an inherited virtual for terminating a PilotJob.
 *
 * @param job the pilot job to terminate
 */
113 void terminatePilotJob(
PilotJob *job)
override;
/**
 * Checks this service's configured properties.
 * NOTE(review): takes no arguments and returns void, so any failure must be
 * reported by other means (presumably an exception) -- confirm in the .cpp.
 */
125 void validateProperties();
129 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
130 std::map<std::string, std::string> property_list,
131 std::map<std::string, double> messagepayload_list,
134 std::shared_ptr<StorageService> scratch_space);
138 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
139 std::map<std::string, std::string> property_list,
140 std::map<std::string, double> messagepayload_list,
141 std::shared_ptr<StorageService> scratch_space);
144 void initiateInstance(
const std::string &hostname,
145 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
146 std::map<std::string, std::string> property_list,
147 std::map<std::string, double> messagepayload_list,
// Per-key resource description; same type as the compute_resources constructor
// argument. NOTE(review): presumably hostname -> (number of cores, RAM) -- confirm.
152 std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
// A double per string key. NOTE(review): presumably RAM currently available on
// each host -- units are not visible here.
155 std::map<std::string, double> ram_availabilities;
// An unsigned count per string key. NOTE(review): presumably the number of
// threads currently running on each host.
156 std::map<std::string, unsigned long> running_thread_counts;
// Core total for the service; no in-class initializer is shown here, so it is
// presumably set during construction/initialization -- confirm.
158 unsigned long total_num_cores;
// Shared handle to an Alarm; starts out null (no alarm set).
163 std::shared_ptr<Alarm> death_alarm =
nullptr;
// For each standard job, a set of workflow files (raw, non-owning pointers as
// far as this declaration shows) associated with scratch space.
167 std::map<StandardJob *, std::set<WorkflowFile*>> files_in_scratch;
// Set of standard jobs tracked as running.
170 std::set<StandardJob *> running_jobs;
// For each standard job and each of its tasks, a (string, unsigned long) pair;
// this is the same tuple type that pickAllocation() returns.
173 std::map<StandardJob *, std::map<WorkflowTask *, std::tuple<std::string, unsigned long>>> job_run_specs;
// For each standard job, every Workunit associated with it.
176 std::map<StandardJob *, std::set<std::shared_ptr<Workunit>>> all_workunits;
// Double-ended queue of Workunits that are ready (see dispatchReadyWorkunits()).
178 std::deque<std::shared_ptr<Workunit>> ready_workunits;
// For each standard job, the Workunits recorded as completed.
180 std::map<StandardJob *, std::set<std::shared_ptr<Workunit>>> completed_workunits;
// For each standard job, the WorkunitExecutors associated with it.
183 std::map<StandardJob *, std::set<std::shared_ptr<WorkunitExecutor>>> workunit_executors;
/**
 * Records a set of files as being stored in scratch space.
 * NOTE(review): presumably updates files_in_scratch -- confirm in the .cpp.
 *
 * @param scratch_files the files to record (the set is passed by value)
 */
186 void storeFilesStoredInScratch(std::set<WorkflowFile*> scratch_files);
/**
 * Cleans up scratch space.
 * NOTE(review): which files are removed, and when, is not visible from this
 * declaration.
 */
189 void cleanUpScratch();
/**
 * Terminates the service.
 *
 * @param notify_pilot_job_submitters flag controlling whether pilot-job
 *        submitters are notified (exact notification semantics not visible here)
 */
195 void terminate(
bool notify_pilot_job_submitters);
/**
 * Fails the standard jobs the service currently has.
 * NOTE(review): presumably iterates running_jobs -- confirm in the .cpp.
 */
197 void failCurrentStandardJobs();
/**
 * Handles the completion of a work unit by a work unit executor.
 *
 * @param workunit_executor the executor that completed
 * @param workunit the work unit it was executing
 */
199 void processWorkunitExecutorCompletion(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit);
/**
 * Handles the failure of a work unit executor.
 *
 * @param workunit_executor the executor that failed
 * @param workunit the work unit it was executing
 * @param cause the failure cause reported for the failure
 */
201 void processWorkunitExecutorFailure(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit, std::shared_ptr<FailureCause> cause);
/**
 * Handles the crash of a work unit executor. Unlike the completion/failure
 * handlers, no Workunit or failure cause is passed in.
 *
 * @param workunit_executor the executor that crashed
 */
203 void processWorkunitExecutorCrash(std::shared_ptr<WorkunitExecutor> workunit_executor);
/**
 * Stops tracking a work unit executor.
 * NOTE(review): presumably removes it from workunit_executors -- confirm.
 *
 * @param workunit_executor the executor to forget
 */
205 void forgetWorkunitExecutor(std::shared_ptr<WorkunitExecutor> workunit_executor);
/**
 * Handles a request to terminate a standard job.
 *
 * @param job the standard job to terminate
 * @param answer_mailbox name of the mailbox for the answer (presumably where
 *        the termination acknowledgement is sent -- confirm)
 */
207 void processStandardJobTerminationRequest(
StandardJob *job,
const std::string &answer_mailbox);
/**
 * Processes the next incoming message.
 *
 * @return a boolean; NOTE(review): its meaning (e.g. "keep running" vs "stop")
 *         is not visible from this declaration -- confirm in the .cpp.
 */
209 bool processNextMessage();
/**
 * Dispatches work units that are ready.
 * NOTE(review): presumably drains ready_workunits onto available resources --
 * the selection policy is not visible here.
 */
211 void dispatchReadyWorkunits();
218 enum JobTerminationCause {
223 COMPUTE_SERVICE_KILLED
/**
 * Terminates a standard job that is running.
 *
 * @param job the job to terminate
 * @param termination_cause why the job is being terminated, as a
 *        JobTerminationCause value (e.g. COMPUTE_SERVICE_KILLED)
 */
226 void terminateRunningStandardJob(
StandardJob *job, JobTerminationCause termination_cause);
/**
 * Fails a standard job that is running.
 *
 * @param job the job to fail
 * @param cause the failure cause to associate with the job
 */
228 void failRunningStandardJob(
StandardJob *job, std::shared_ptr<FailureCause> cause);
/**
 * Handles a request for information about the service's resources.
 *
 * @param answer_mailbox name of the mailbox for the answer
 */
230 void processGetResourceInformation(
const std::string &answer_mailbox);
/**
 * Handles a pilot job submission request.
 *
 * @param answer_mailbox name of the mailbox for the answer
 * @param job the submitted pilot job
 * @param service_specific_args string-to-string arguments; note they are taken
 *        by value here, whereas processSubmitStandardJob takes a reference
 */
232 void processSubmitPilotJob(
const std::string &answer_mailbox,
PilotJob *job, std::map<std::string, std::string> service_specific_args);
/**
 * Handles a standard job submission request.
 *
 * @param answer_mailbox name of the mailbox for the answer
 * @param job the submitted standard job
 * @param service_specific_arguments string-to-string arguments, taken by
 *        non-const reference
 */
234 void processSubmitStandardJob(
const std::string &answer_mailbox,
StandardJob *job,
235 std::map<std::string, std::string> &service_specific_arguments);
/**
 * Picks a resource allocation for a task.
 *
 * @param task the task needing an allocation
 * @param required_host a host name constraint (NOTE(review): how an
 *        "any host" request is encoded is not visible here)
 * @param required_num_cores a core-count constraint
 * @param required_ram a RAM constraint (units not visible here)
 * @param hosts_to_avoid set of host names to exclude; taken by non-const
 *        reference
 * @return a (string, unsigned long) tuple -- presumably (host name, number of
 *         cores); the same tuple type is stored in job_run_specs
 */
237 std::tuple<std::string, unsigned long> pickAllocation(
WorkflowTask *task,
238 std::string required_host,
unsigned long required_num_cores,
double required_ram,
239 std::set<std::string> &hosts_to_avoid);
/**
 * Tells whether a standard job can run on this service.
 *
 * @param job the job to check
 * @param service_specific_arguments string-to-string arguments for the job,
 *        taken by non-const reference
 * @return true if the job can run (the exact criteria are not visible here)
 */
241 bool jobCanRun(
StandardJob *job, std::map<std::string, std::string> &service_specific_arguments);
/**
 * Tells whether at least one host has the given resources.
 * NOTE(review): whether this checks total capacity or current availability is
 * not visible from the declaration.
 *
 * @param num_cores number of cores needed
 * @param ram amount of RAM needed (units not visible here)
 * @return true if such a host exists
 */
243 bool isThereAtLeastOneHostWithResources(
unsigned long num_cores,
double ram);
/**
 * Override of an inherited cleanup hook.
 * NOTE(review): presumably called when the service's process ends -- confirm
 * against the base class.
 *
 * @param has_terminated_cleanly whether the termination was clean
 * @param return_value an integer return value passed to the hook
 */
245 void cleanup(
bool has_terminated_cleanly,
int return_value)
override;
/**
 * Tells whether all compute resources are down and no work unit executor
 * ("WUE" presumably abbreviates WorkunitExecutor) is running.
 *
 * @return true if both conditions hold
 */
247 bool areAllComputeResourcesDownWithNoWUERunning();
// Shared handle to a HostStateChangeDetector (declared in the helpers header
// included above). NOTE(review): presumably watches the compute hosts for
// state changes -- confirm where it is created.
252 std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
258 #endif //WRENCH_BAREMETALCOMPUTESERVICE_H
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:44
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:54
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:26
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:33
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job termination.
Definition: ComputeServiceMessagePayload.h:36
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask it for information on its resources.
Definition: ComputeServiceMessagePayload.h:56
The compute service base class.
Definition: ComputeService.h:35
A cloud-based compute service that manages a set of physical hosts and controls access to their resources by (transparently) executing jobs in VM instances.
Definition: CloudComputeService.h:37
A standard (i.e., non-pilot) workflow job that can be submitted to a ComputeService by a WMS (via a JobManager).
Definition: StandardJob.h:37
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:52
A batch-scheduled compute service that manages a set of compute hosts and controls access to their resources via a batch queue.
Definition: BatchComputeService.h:49
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standard job.
Definition: ComputeServiceMessagePayload.h:30
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job has failed.
Definition: ComputeServiceMessagePayload.h:32
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:34
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission.
Definition: ComputeServiceMessagePayload.h:40
A computational task in a Workflow.
Definition: WorkflowTask.h:27
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:58
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:31
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:42
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submission.
Definition: ComputeServiceMessagePayload.h:28
A class that provides basic simulation methods. Once the simulation object has been explicitly or implicitly destroyed, any call to the WRENCH APIs will result in undefined behavior.
Definition: Simulation.h:45
static const std::string JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not support the type of a submitted job.
Definition: ComputeServiceMessagePayload.h:24
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:38
static const std::string SUPPORTS_PILOT_JOBS
Whether the compute service supports pilot jobs (true or false)
Definition: ComputeServiceProperty.h:26
A pilot (i.e., non-standard) workflow job that can be submitted to a ComputeService by a WMS (via a JobManager).
Definition: PilotJob.h:29
static const std::string PILOT_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has failed.
Definition: ComputeServiceMessagePayload.h:46
static const std::string SUPPORTS_STANDARD_JOBS
Whether the compute service supports standard jobs (true or false)
Definition: ComputeServiceProperty.h:24