WRENCH  1.11
Cyberinfrastructure Simulation Workbench
Overview Installation Getting Started WRENCH 101 WRENCH 102
BareMetalComputeService.h
1 
10 #ifndef WRENCH_BAREMETALCOMPUTESERVICE_H
11 #define WRENCH_BAREMETALCOMPUTESERVICE_H
12 
13 
14 #include <queue>
15 
16 #include "wrench/services/compute/ComputeService.h"
17 #include "BareMetalComputeServiceProperty.h"
18 #include "BareMetalComputeServiceMessagePayload.h"
19 #include "wrench/services/helper_services/host_state_change_detector/HostStateChangeDetector.h"
20 
21 
22 
23 namespace wrench {
24 
25  class Simulation;
26  class StorageService;
27  class FailureCause;
28  class Alarm;
29  class Action;
30  class ActionExecutionService;
31 
32 
46 
47  friend class CloudComputeService;
48  friend class BatchComputeService;
49 
50  private:
51 
52  WRENCH_PROPERTY_COLLECTION_TYPE default_property_values = {
56  };
57 
58 WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE default_messagepayload_values = {
86  };
87 
88  public:
89 
90  // Public Constructor
91  BareMetalComputeService(const std::string &hostname,
92  const std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
93  std::string scratch_space_mount_point,
94  WRENCH_PROPERTY_COLLECTION_TYPE property_list = {},
95  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list = {}
96  );
97 
98  // Public Constructor
99  BareMetalComputeService(const std::string &hostname,
100  const std::vector<std::string> compute_hosts,
101  std::string scratch_space_mount_point,
102  WRENCH_PROPERTY_COLLECTION_TYPE property_list = {},
103  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list = {}
104  );
105 
106  virtual bool supportsStandardJobs() override;
107  virtual bool supportsCompoundJobs() override;
108  virtual bool supportsPilotJobs() override;
109 
110  /***********************/
112  /***********************/
113 
114  void submitCompoundJob(std::shared_ptr<CompoundJob> job, const std::map<std::string, std::string> &service_specific_args) override;
115 
116  void terminateCompoundJob(std::shared_ptr<CompoundJob> job) override;
117 
119 
120 
121  protected:
122  friend class JobManager;
123 
124  void validateServiceSpecificArguments(std::shared_ptr<CompoundJob> job,
125  std::map<std::string, std::string> &service_specific_args) override;
126 
127 
128  BareMetalComputeService(const std::string &hostname,
129  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
130  WRENCH_PROPERTY_COLLECTION_TYPE property_list,
131  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list,
132  double ttl,
133  std::shared_ptr<PilotJob> pj, std::string suffix,
134  std::shared_ptr<StorageService> scratch_space); // reference to upper level scratch space
135 
136  BareMetalComputeService(const std::string &hostname,
137  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
138  WRENCH_PROPERTY_COLLECTION_TYPE property_list,
139  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list,
140  std::shared_ptr<StorageService> scratch_space);
141 
142  void validateProperties();
143 
144 
145  // Low-level constructor helper method
146  void initiateInstance(const std::string &hostname,
147  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
148  WRENCH_PROPERTY_COLLECTION_TYPE property_list,
149  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list,
150  double ttl,
151  std::shared_ptr<PilotJob> pj);
152 
153 
154  protected:
155 
156  std::shared_ptr<Alarm> death_alarm = nullptr;
157  std::shared_ptr<PilotJob> containing_pilot_job; // In case this service is in fact a pilot job
158 
159 
160  std::unordered_map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<DataFile>>> files_in_scratch;
161 
162  std::set<std::shared_ptr<CompoundJob>> current_jobs;
163 
164  std::set<std::shared_ptr<Action>> not_ready_actions;
165  std::vector<std::shared_ptr<Action>> ready_actions;
166  std::set<std::shared_ptr<Action>> dispatched_actions;
167  std::unordered_map<std::shared_ptr<CompoundJob>, int> num_dispatched_actions_for_cjob;
168 
169  double ttl;
170  double death_date;
171  bool has_ttl;
172 
173 
174  // Add the scratch files of one standard job to the list of all the scratch files of all the standard jobs inside the pilot job
175 // void storeFilesStoredInScratch(std::set<std::shared_ptr<DataFile>> scratch_files);
176 
177  // Clean up the scratch if I am a pilot job
178  void cleanUpScratch();
179 
180  int main() override;
181 
182  // Helper functions to make main() a bit more palatable
183 
184  void terminate(bool send_failure_notifications, ComputeService::TerminationCause termination_cause);
185 
186  void processActionDone(std::shared_ptr<Action> action);
187 
188  void processCompoundJobTerminationRequest(std::shared_ptr<CompoundJob> job, simgrid::s4u::Mailbox *answer_mailbox);
189 
190  bool processNextMessage();
191 
192  void dispatchReadyActions();
193 
194 
195  void terminateCurrentCompoundJob(std::shared_ptr<CompoundJob> job, ComputeService::TerminationCause termination_cause);
196 
197  void processGetResourceInformation(simgrid::s4u::Mailbox *answer_mailbox, const std::string &key);
198 
199 // void processSubmitPilotJob(const std::string &answer_mailbox, std::shared_ptr<PilotJob> job, std::map<std::string, std::string> service_specific_args);
200 
201  void processSubmitCompoundJob(simgrid::s4u::Mailbox *answer_mailbox, std::shared_ptr<CompoundJob> job,
202  std::map<std::string, std::string> &service_specific_arguments);
203 
204  void processIsThereAtLeastOneHostWithAvailableResources(
205  simgrid::s4u::Mailbox *answer_mailbox, unsigned long num_cores, double ram);
206 
207 // std::tuple<std::string, unsigned long> pickAllocation(std::shared_ptr<WorkflowTask>task,
208 // std::string required_host, unsigned long required_num_cores, double required_ram,
209 // std::set<std::string> &hosts_to_avoid);
210 
211 // bool jobCanRun(std::shared_ptr<StandardJob> job, std::map<std::string, std::string> &service_specific_arguments);
212 //
213 // bool isThereAtLeastOneHostWithResources(unsigned long num_cores, double ram);
214 
215  void cleanup(bool has_terminated_cleanly, int return_value) override;
216 
217 // bool areAllComputeResourcesDownWithNoWUERunning();
218 
219  static std::tuple<std::string, unsigned long> parseResourceSpec(const std::string &spec);
220 
221 
222  int exit_code = 0;
223 
224  std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
225 
226  std::shared_ptr<ActionExecutionService> action_execution_service;
227 
228  /***********************/
230  /***********************/
231 
232  };
233 };
234 
235 
236 #endif //WRENCH_BAREMETALCOMPUTESERVICE_H
wrench::ComputeServiceMessagePayload::PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:60
wrench::ComputeServiceMessagePayload::IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state whether one host has some resources a...
Definition: ComputeServiceMessagePayload.h:78
wrench::ComputeServiceMessagePayload::SUBMIT_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a compound job submissio...
Definition: ComputeServiceMessagePayload.h:43
wrench::ComputeServiceMessagePayload::TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:66
wrench::ComputeServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:27
wrench::ComputeService::TerminationCause
TerminationCause
Job termination cause enum.
Definition: ComputeService.h:67
wrench::BareMetalComputeServiceMessagePayload::NOT_ENOUGH_CORES_MESSAGE_PAYLOAD
static const std::string NOT_ENOUGH_CORES_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not have sufficie...
Definition: BareMetalComputeServiceMessagePayload.h:28
wrench::ComputeServiceMessagePayload::COMPOUND_JOB_DONE_MESSAGE_PAYLOAD
static const std::string COMPOUND_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a compou...
Definition: ComputeServiceMessagePayload.h:45
wrench::ServiceMessagePayload::DAEMON_STOPPED_MESSAGE_PAYLOAD
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:37
wrench::ComputeServiceMessagePayload::TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job terminati...
Definition: ComputeServiceMessagePayload.h:37
wrench::ComputeServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to request information on its resources...
Definition: ComputeServiceMessagePayload.h:74
wrench::JobManager
A helper daemon (co-located with and explicitly started by an execution controller),...
Definition: JobManager.h:56
wrench::BareMetalComputeService::supportsStandardJobs
virtual bool supportsStandardJobs() override
Returns true if the service supports standard jobs.
Definition: BareMetalComputeService.cpp:959
wrench::ComputeServiceMessagePayload::IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask if one host has some resources a...
Definition: ComputeServiceMessagePayload.h:80
wrench::BareMetalComputeService::supportsCompoundJobs
virtual bool supportsCompoundJobs() override
Returns true if the service supports compound jobs.
Definition: BareMetalComputeService.cpp:967
wrench::BareMetalComputeServiceProperty::TASK_STARTUP_OVERHEAD
static const std::string TASK_STARTUP_OVERHEAD
The overhead to start a thread, in seconds.
Definition: BareMetalComputeServiceProperty.h:28
wrench::ComputeService
The compute service base class.
Definition: ComputeService.h:34
wrench::CloudComputeService
A cloud-based compute service that manages a set of physical hosts and controls access to their resou...
Definition: CloudComputeService.h:36
wrench::BareMetalComputeService
A compute service that manages a set of multi-core compute hosts and provides access to their resourc...
Definition: BareMetalComputeService.h:45
wrench::ComputeServiceMessagePayload::TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:64
wrench::BatchComputeService
A batch-scheduled compute service that manages a set of compute hosts and con...
Definition: BatchComputeService.h:49
wrench::BareMetalComputeService::supportsPilotJobs
virtual bool supportsPilotJobs() override
Returns true if the service supports pilot jobs.
Definition: BareMetalComputeService.cpp:975
wrench::ComputeServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standa...
Definition: ComputeServiceMessagePayload.h:31
wrench::ComputeServiceMessagePayload::STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job ha...
Definition: ComputeServiceMessagePayload.h:33
wrench
Definition: Action.cpp:28
wrench::ComputeServiceMessagePayload::TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:35
wrench::ComputeServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission...
Definition: ComputeServiceMessagePayload.h:56
wrench::ComputeServiceMessagePayload::COMPOUND_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string COMPOUND_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running compound job ha...
Definition: ComputeServiceMessagePayload.h:47
wrench::BareMetalComputeService::BareMetalComputeService
BareMetalComputeService(const std::string &hostname, const std::map< std::string, std::tuple< unsigned long, double >> compute_resources, std::string scratch_space_mount_point, WRENCH_PROPERTY_COLLECTION_TYPE property_list={}, WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list={})
Constructor.
Definition: BareMetalComputeService.cpp:250
wrench::parseResourceSpec
static std::tuple< std::string, unsigned long > parseResourceSpec(const std::string &spec)
Helper static method to parse resource specifications to the <cores,ram> format.
Definition: ActionExecutionService.cpp:76
wrench::ComputeServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:76
wrench::ServiceMessagePayload::STOP_DAEMON_MESSAGE_PAYLOAD
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:35
wrench::ComputeServiceMessagePayload::PILOT_JOB_STARTED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:58
wrench::ComputeServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submissio...
Definition: ComputeServiceMessagePayload.h:29
wrench::ComputeServiceMessagePayload::JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
static const std::string JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not support the t...
Definition: ComputeServiceMessagePayload.h:24
wrench::ComputeServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:54
wrench::ComputeServiceMessagePayload::SUBMIT_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a compound job.
Definition: ComputeServiceMessagePayload.h:41
wrench::BareMetalComputeServiceProperty::TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
static const std::string TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
If true, service will terminate whenever all resources are down.
Definition: BareMetalComputeServiceProperty.h:32
wrench::ComputeServiceMessagePayload::PILOT_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has failed.
Definition: ComputeServiceMessagePayload.h:62
wrench::ComputeServiceMessagePayload::TERMINATE_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a compound job.
Definition: ComputeServiceMessagePayload.h:49
wrench::BareMetalComputeServiceProperty::FAIL_ACTION_AFTER_ACTION_EXECUTOR_CRASH
static const std::string FAIL_ACTION_AFTER_ACTION_EXECUTOR_CRASH
If true, fail action after an executor crash, otherwise re-ready it and try again.
Definition: BareMetalComputeServiceProperty.h:30
wrench::ComputeServiceMessagePayload::TERMINATE_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a compound job terminati...
Definition: ComputeServiceMessagePayload.h:51