10 #ifndef WRENCH_BAREMETALCOMPUTESERVICE_H
11 #define WRENCH_BAREMETALCOMPUTESERVICE_H
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/compute/standard_job_executor/StandardJobExecutor.h"
18 #include "BareMetalComputeServiceProperty.h"
19 #include "BareMetalComputeServiceMessagePayload.h"
20 #include "wrench/services/compute/workunit_executor/Workunit.h"
21 #include "wrench/services/helpers/HostStateChangeDetector.h"
55 std::map<std::string, std::string> default_property_values = {
62 std::map<std::string, double> default_messagepayload_values = {
90 const std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
91 std::string scratch_space_mount_point,
98 const std::vector<std::string> compute_hosts,
99 std::string scratch_space_mount_point,
/**
 * @brief Submit a standard job to this service (per the method name).
 *
 * Declared `override`, so it replaces a virtual method of a base class
 * (presumably ComputeService, whose header is included by this file - confirm).
 *
 * @param job: the StandardJob to submit, held through a shared pointer
 * @param service_specific_args: string-to-string map of service-specific
 *        arguments, taken by const reference (the accepted keys are not
 *        visible in this listing)
 */
108 void submitStandardJob(std::shared_ptr<StandardJob> job,
const std::map<std::string, std::string> &service_specific_args)
override;
/**
 * @brief Submit a pilot job to this service (per the method name).
 *
 * Declared `override`, so it replaces a virtual method of a base class
 * (presumably ComputeService, whose header is included by this file - confirm).
 *
 * @param job: the PilotJob to submit, held through a shared pointer
 * @param service_specific_args: string-to-string map of service-specific
 *        arguments, taken by const reference (the accepted keys are not
 *        visible in this listing)
 */
110 void submitPilotJob(std::shared_ptr<PilotJob> job,
const std::map<std::string, std::string> &service_specific_args)
override;
/**
 * @brief Presumably checks that the service's configured property values are
 *        acceptable; the checks themselves live in the implementation file
 *        and are not visible here - confirm.
 *
 * Takes no arguments and returns nothing, so any failure must be reported
 * by other means (NOTE(review): likely an exception - confirm).
 */
126 void validateProperties();
130 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
134 std::shared_ptr<PilotJob> pj, std::string suffix,
135 std::shared_ptr<StorageService> scratch_space);
139 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
142 std::shared_ptr<StorageService> scratch_space);
145 void initiateInstance(
const std::string &
hostname,
146 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
150 std::shared_ptr<PilotJob> pj);
/** String-keyed map to (unsigned long, double) tuples; presumably
 *  host name -> (number of cores, RAM) - confirm ordering and units. */
153 std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
/** String-keyed map to a double; presumably the RAM currently available on each host - confirm units. */
156 std::map<std::string, double> ram_availabilities;
/** String-keyed map to an unsigned counter; presumably the number of threads currently running on each host. */
157 std::map<std::string, unsigned long> running_thread_counts;
/** Aggregate core count; presumably the sum of cores over compute_resources.
 *  NOTE(review): no in-class initializer is visible here - confirm it is set in every constructor. */
159 unsigned long total_num_cores;
/** Shared pointer to an Alarm, initialised to nullptr (i.e. no alarm is held
 *  at construction). Presumably used to stop the service after a
 *  time-to-live expires - confirm against the implementation. */
164 std::shared_ptr<Alarm> death_alarm =
nullptr;
/** Shared pointer to a PilotJob; per the name, the pilot job inside which this
 *  service runs. Presumably null when the service is not pilot-job-hosted - confirm. */
166 std::shared_ptr<PilotJob> containing_pilot_job;
/** For each standard job, a set of WorkflowFile pointers; per the name, the
 *  files that job has placed in scratch space. The pointers are raw, so this
 *  container presumably does not own the files - confirm. */
168 std::map<std::shared_ptr<StandardJob> , std::set<WorkflowFile*>> files_in_scratch;
/** Set of standard jobs; per the name, those currently running on this service. */
171 std::set<std::shared_ptr<StandardJob> > running_jobs;
/** For each standard job, a per-task (string, unsigned long) tuple; presumably the
 *  (host name, core count) each task should run with - confirm. */
174 std::map<std::shared_ptr<StandardJob> , std::map<WorkflowTask *, std::tuple<std::string, unsigned long>>> job_run_specs;
/** For each standard job, the set of Workunit objects associated with it (per the name, all of them). */
177 std::map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<Workunit>>> all_workunits;
/** Double-ended queue of Workunit objects; per the name, those ready to be dispatched.
 *  Queue discipline (FIFO or otherwise) is not visible here. */
179 std::deque<std::shared_ptr<Workunit>> ready_workunits;
/** For each standard job, the set of Workunit objects that have completed (per the name). */
181 std::map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<Workunit>>> completed_workunits;
/** For each standard job, the set of WorkunitExecutor objects associated with it;
 *  presumably the executors currently working on that job - confirm. */
184 std::map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<WorkunitExecutor>>> workunit_executors;
/**
 * @brief Per the name, records a set of files as being stored in scratch space.
 * @param scratch_files: set of WorkflowFile pointers, received by value (the
 *        set is copied at the call site; the files themselves are not)
 */
187 void storeFilesStoredInScratch(std::set<WorkflowFile*> scratch_files);
/**
 * @brief Per the name, cleans up the scratch space. Which files are removed
 *        and when is decided in the implementation, not visible here.
 */
190 void cleanUpScratch();
/**
 * @brief Per the name, terminates this service.
 * @param notify_pilot_job_submitters: flag that, per its name, selects whether
 *        pilot job submitters are notified of the termination
 */
196 void terminate(
bool notify_pilot_job_submitters);
/**
 * @brief Per the name, marks the standard jobs currently handled by this service
 *        as failed. The failure cause used is not visible here.
 */
198 void failCurrentStandardJobs();
/**
 * @brief Per the name, handles the successful completion of a workunit executor.
 * @param workunit_executor: the WorkunitExecutor that completed
 * @param workunit: the Workunit it was handling (presumably - confirm)
 */
200 void processWorkunitExecutorCompletion(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit);
/**
 * @brief Per the name, handles the failure of a workunit executor.
 * @param workunit_executor: the WorkunitExecutor that failed
 * @param workunit: the Workunit it was handling (presumably - confirm)
 * @param cause: the FailureCause describing the failure
 */
202 void processWorkunitExecutorFailure(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit, std::shared_ptr<FailureCause> cause);
/**
 * @brief Per the name, handles the crash of a workunit executor. Unlike the
 *        failure handler, no Workunit or FailureCause is passed in.
 * @param workunit_executor: the WorkunitExecutor that crashed
 */
204 void processWorkunitExecutorCrash(std::shared_ptr<WorkunitExecutor> workunit_executor);
/**
 * @brief Per the name, drops this service's record of a workunit executor
 *        (presumably removing it from workunit_executors - confirm).
 * @param workunit_executor: the WorkunitExecutor to forget
 */
206 void forgetWorkunitExecutor(std::shared_ptr<WorkunitExecutor> workunit_executor);
/**
 * @brief Per the name, handles a request to terminate a standard job.
 * @param job: the StandardJob targeted by the request
 * @param answer_mailbox: name of the mailbox to which the answer should be
 *        sent (per the parameter name), taken by const reference
 */
208 void processStandardJobTerminationRequest(std::shared_ptr<StandardJob> job,
const std::string &answer_mailbox);
/**
 * @brief Per the name, processes the next incoming message.
 * @return a boolean whose meaning is not visible in this listing
 *         (NOTE(review): presumably false when the service should stop - confirm)
 */
210 bool processNextMessage();
/**
 * @brief Per the name, dispatches workunits that are ready to run (presumably
 *        those held in ready_workunits - confirm).
 */
212 void dispatchReadyWorkunits();
219 enum JobTerminationCause {
224 COMPUTE_SERVICE_KILLED
/**
 * @brief Per the name, terminates a standard job that is currently running.
 * @param job: the StandardJob to terminate
 * @param termination_cause: a JobTerminationCause value stating why the job
 *        is being terminated (COMPUTE_SERVICE_KILLED is one of the enumerators)
 */
227 void terminateRunningStandardJob(std::shared_ptr<StandardJob> job, JobTerminationCause termination_cause);
/**
 * @brief Per the name, declares a currently running standard job as failed.
 * @param job: the StandardJob to fail
 * @param cause: the FailureCause to associate with the failure
 */
229 void failRunningStandardJob(std::shared_ptr<StandardJob> job, std::shared_ptr<FailureCause> cause);
/**
 * @brief Per the name, handles a request for information about this service's resources.
 * @param answer_mailbox: name of the mailbox to which the answer should be
 *        sent (per the parameter name), taken by const reference
 */
231 void processGetResourceInformation(
const std::string &answer_mailbox);
/**
 * @brief Per the name, handles a pilot job submission request.
 * @param answer_mailbox: name of the mailbox to which the answer should be
 *        sent (per the parameter name), taken by const reference
 * @param job: the PilotJob being submitted
 * @param service_specific_args: string-to-string map of service-specific
 *        arguments, received by value (copied at the call site).
 *        NOTE(review): processSubmitStandardJob takes its map by non-const
 *        reference instead - confirm the difference is intentional.
 */
233 void processSubmitPilotJob(
const std::string &answer_mailbox, std::shared_ptr<PilotJob> job, std::map<std::string, std::string> service_specific_args);
/**
 * @brief Per the name, handles a standard job submission request.
 * @param answer_mailbox: name of the mailbox to which the answer should be
 *        sent (per the parameter name), taken by const reference
 * @param job: the StandardJob being submitted
 * @param service_specific_arguments: string-to-string map of service-specific
 *        arguments, taken by NON-const reference, so the implementation is
 *        allowed to modify the caller's map (NOTE(review): confirm whether it
 *        actually does; if not, const& would document the intent better)
 */
235 void processSubmitStandardJob(
const std::string &answer_mailbox, std::shared_ptr<StandardJob> job,
236 std::map<std::string, std::string> &service_specific_arguments);
/**
 * @brief Per the name, handles a request asking whether at least one host has
 *        the given resources available.
 * @param answer_mailbox: name of the mailbox to which the answer should be
 *        sent (per the parameter name), taken by const reference
 * @param num_cores: number of cores asked for
 * @param ram: amount of RAM asked for (units not visible here - presumably bytes, confirm)
 */
238 void processIsThereAtLeastOneHostWithAvailableResources(
239 const std::string &answer_mailbox,
unsigned long num_cores,
double ram);
/**
 * @brief Per the name, picks a resource allocation for a task.
 * @param task: the WorkflowTask to place (raw pointer)
 * @param required_host: a host name, received by value; presumably constrains
 *        the choice to that host (how "no constraint" is encoded is not
 *        visible here - confirm)
 * @param required_num_cores: number of cores required
 * @param required_ram: amount of RAM required (units not visible here)
 * @param hosts_to_avoid: set of host names, taken by NON-const reference;
 *        per the name, hosts that must not be picked (NOTE(review): confirm
 *        whether the implementation modifies this set)
 * @return a (string, unsigned long) tuple; presumably (chosen host name,
 *         number of cores allocated). The value returned when no allocation
 *         is possible is not visible here - confirm.
 */
241 std::tuple<std::string, unsigned long> pickAllocation(
WorkflowTask *task,
242 std::string required_host,
unsigned long required_num_cores,
double required_ram,
243 std::set<std::string> &hosts_to_avoid);
/**
 * @brief Per the name, tells whether a standard job can run on this service.
 * @param job: the StandardJob to check
 * @param service_specific_arguments: string-to-string map of service-specific
 *        arguments, taken by NON-const reference (NOTE(review): confirm
 *        whether the implementation modifies it)
 * @return presumably true if the job can run, false otherwise - confirm
 */
245 bool jobCanRun(std::shared_ptr<StandardJob> job, std::map<std::string, std::string> &service_specific_arguments);
/**
 * @brief Per the name, tells whether at least one host has the given resources.
 *        NOTE(review): whether this checks total or currently available
 *        resources is not visible here - confirm.
 * @param num_cores: number of cores asked for
 * @param ram: amount of RAM asked for (units not visible here)
 * @return presumably true if such a host exists, false otherwise - confirm
 */
247 bool isThereAtLeastOneHostWithResources(
unsigned long num_cores,
double ram);
/**
 * @brief Cleanup hook. Declared `override`, so it replaces a virtual method of
 *        a base class; when it is invoked is decided by that base class and is
 *        not visible here.
 * @param has_terminated_cleanly: flag that, per its name, tells whether the
 *        termination was clean
 * @param return_value: an integer return value (presumably that of the
 *        service's main routine - confirm)
 */
249 void cleanup(
bool has_terminated_cleanly,
int return_value)
override;
/**
 * @brief Per the name, tells whether every compute resource is down while no
 *        WUE is running ("WUE" presumably abbreviates WorkunitExecutor - confirm).
 * @return presumably true when both conditions hold - confirm
 */
251 bool areAllComputeResourcesDownWithNoWUERunning();
/** Shared pointer to a HostStateChangeDetector (declared in the included
 *  HostStateChangeDetector.h); presumably watches the compute hosts for
 *  state changes on behalf of this service - confirm. */
256 std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
262 #endif //WRENCH_BAREMETALCOMPUTESERVICE_H
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:44
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state whether one host has some resources available.
Definition: ComputeServiceMessagePayload.h:60
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:54
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:26
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:33
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job terminati...
Definition: ComputeServiceMessagePayload.h:36
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask it for information on its resour...
Definition: ComputeServiceMessagePayload.h:56
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask if one host has some resources available.
Definition: ComputeServiceMessagePayload.h:62
The compute service base class.
Definition: ComputeService.h:33
A cloud-based compute service that manages a set of physical hosts and controls access to their resou...
Definition: CloudComputeService.h:36
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:52
A batch-scheduled compute service that manages a set of compute hosts and controls access to their re...
Definition: BatchComputeService.h:49
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standa...
Definition: ComputeServiceMessagePayload.h:30
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job ha...
Definition: ComputeServiceMessagePayload.h:32
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:34
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission...
Definition: ComputeServiceMessagePayload.h:40
A computational task in a Workflow.
Definition: WorkflowTask.h:31
std::map< std::string, double > messagepayload_list
The service's messagepayload list.
Definition: Service.h:112
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:58
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:31
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:42
std::string hostname
The name of the host on which the daemon is running.
Definition: S4U_Daemon.h:51
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submissio...
Definition: ComputeServiceMessagePayload.h:28
A class that provides basic simulation methods. Once the simulation object has been explicitly or imp...
Definition: Simulation.h:46
static const std::string JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not support the t...
Definition: ComputeServiceMessagePayload.h:24
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:38
static const std::string SUPPORTS_PILOT_JOBS
Whether the compute service supports pilot jobs (true or false)
Definition: ComputeServiceProperty.h:26
static const std::string PILOT_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has failed.
Definition: ComputeServiceMessagePayload.h:46
std::map< std::string, std::string > property_list
The service's property list.
Definition: Service.h:109
static const std::string SUPPORTS_STANDARD_JOBS
Whether the compute service supports standard jobs (true or false)
Definition: ComputeServiceProperty.h:24