WRENCH  1.10
Cyberinfrastructure Simulation Workbench
Overview Installation Getting Started WRENCH 101 WRENCH 102
BareMetalComputeService.h
1 
10 #ifndef WRENCH_BAREMETALCOMPUTESERVICE_H
11 #define WRENCH_BAREMETALCOMPUTESERVICE_H
12 
13 
14 #include <queue>
15 
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/compute/standard_job_executor/StandardJobExecutor.h"
18 #include "BareMetalComputeServiceProperty.h"
19 #include "BareMetalComputeServiceMessagePayload.h"
20 #include "wrench/services/compute/workunit_executor/Workunit.h"
21 #include "wrench/services/helpers/HostStateChangeDetector.h"
22 
23 
24 
25 namespace wrench {
26 
27  class Simulation;
28 
29  class StorageService;
30 
31  class FailureCause;
32 
33  class Alarm;
34 
35 
49 
50  friend class CloudComputeService;
51  friend class BatchComputeService;
52 
53  private:
54 
55  std::map<std::string, std::string> default_property_values = {
60  };
61 
62  std::map<std::string, double> default_messagepayload_values = {
84  };
85 
86  public:
87 
88  // Public Constructor (overload taking an explicit compute_resources map)
89  BareMetalComputeService(const std::string &hostname,
90  const std::map<std::string, std::tuple<unsigned long, double>> compute_resources, // string -> (unsigned long, double); presumably host name -> (cores, RAM) — TODO confirm
91  std::string scratch_space_mount_point,
92  std::map<std::string, std::string> property_list = {}, // optional; defaults to an empty map
93  std::map<std::string, double> messagepayload_list = {} // optional; defaults to an empty map
94  );
95 
96  // Public Constructor (overload taking a vector of strings, compute_hosts, instead of a resource map)
97  BareMetalComputeService(const std::string &hostname,
98  const std::vector<std::string> compute_hosts, // presumably the names of the hosts to use — TODO confirm how their resources are determined
99  std::string scratch_space_mount_point,
100  std::map<std::string, std::string> property_list = {}, // optional; defaults to an empty map
101  std::map<std::string, double> messagepayload_list = {} // optional; defaults to an empty map
102  );
103 
104  /***********************/
106  /***********************/
107 
108  void submitStandardJob(std::shared_ptr<StandardJob> job, const std::map<std::string, std::string> &service_specific_args) override; // override; takes the StandardJob plus a string-to-string map of service-specific arguments
109 
110  void submitPilotJob(std::shared_ptr<PilotJob> job, const std::map<std::string, std::string> &service_specific_args) override; // override; same shape as submitStandardJob but for a PilotJob
111 
112  void terminateStandardJob(std::shared_ptr<StandardJob> job) override; // override; takes only the StandardJob
113 
114  void terminatePilotJob(std::shared_ptr<PilotJob> job) override; // override; takes only the PilotJob
115 
117 
118  /***********************/
120  /***********************/
121 
122  private:
123 
124  friend class Simulation;
125 
126  void validateProperties();
127 
128  // Low-level Constructor
129  BareMetalComputeService(const std::string &hostname,
130  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
131  std::map<std::string, std::string> property_list,
132  std::map<std::string, double> messagepayload_list,
133  double ttl,
134  std::shared_ptr<PilotJob> pj, std::string suffix,
135  std::shared_ptr<StorageService> scratch_space); // reference to upper level scratch space
136 
137  // Private Constructor
138  BareMetalComputeService(const std::string &hostname,
139  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
140  std::map<std::string, std::string> property_list,
141  std::map<std::string, double> messagepayload_list,
142  std::shared_ptr<StorageService> scratch_space);
143 
144  // Low-level constructor helper method
145  void initiateInstance(const std::string &hostname,
146  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
147  std::map<std::string, std::string> property_list,
148  std::map<std::string, double> messagepayload_list,
149  double ttl,
150  std::shared_ptr<PilotJob> pj);
151 
152 
153  std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
154 
155  // Core availabilities (for each host, how many cores and how many bytes of RAM are currently available on it)
156  std::map<std::string, double> ram_availabilities;
157  std::map<std::string, unsigned long> running_thread_counts;
158 
159  unsigned long total_num_cores;
160 
161  double ttl;
162  bool has_ttl;
163  double death_date;
164  std::shared_ptr<Alarm> death_alarm = nullptr;
165 
166  std::shared_ptr<PilotJob> containing_pilot_job; // In case this service is in fact a pilot job
167 
168  std::map<std::shared_ptr<StandardJob> , std::set<WorkflowFile*>> files_in_scratch;
169 
170  // Set of running jobs
171  std::set<std::shared_ptr<StandardJob> > running_jobs;
172 
173  // Job task execution specs
174  std::map<std::shared_ptr<StandardJob> , std::map<WorkflowTask *, std::tuple<std::string, unsigned long>>> job_run_specs;
175 
176  // Map of all Workunits
177  std::map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<Workunit>>> all_workunits;
178 
179  std::deque<std::shared_ptr<Workunit>> ready_workunits;
180 // std::map<std::shared_ptr<StandardJob> , std::set<Workunit *>> running_workunits;
181  std::map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<Workunit>>> completed_workunits;
182 
183  // Set of running WorkunitExecutors
184  std::map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<WorkunitExecutor>>> workunit_executors;
185 
186  // Add the scratch files of one standardjob to the list of all the scratch files of all the standard jobs inside the pilot job
187  void storeFilesStoredInScratch(std::set<WorkflowFile*> scratch_files);
188 
189  // Cleanup the scratch if I am a pilot job
190  void cleanUpScratch();
191 
192  int main() override;
193 
194  // Helper functions to make main() a bit more palatable
195 
196  void terminate(bool notify_pilot_job_submitters);
197 
198  void failCurrentStandardJobs();
199 
200  void processWorkunitExecutorCompletion(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit);
201 
202  void processWorkunitExecutorFailure(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit, std::shared_ptr<FailureCause> cause);
203 
204  void processWorkunitExecutorCrash(std::shared_ptr<WorkunitExecutor> workunit_executor);
205 
206  void forgetWorkunitExecutor(std::shared_ptr<WorkunitExecutor> workunit_executor);
207 
208  void processStandardJobTerminationRequest(std::shared_ptr<StandardJob> job, const std::string &answer_mailbox);
209 
210  bool processNextMessage();
211 
212  void dispatchReadyWorkunits();
213 
214 // void someHostIsBackOn(simgrid::s4u::Host const &h);
215 // bool host_back_on = false;
216 
217 
219  enum JobTerminationCause { // reason code passed as termination_cause to terminateRunningStandardJob()
221  TERMINATED, // presumably: the job was explicitly terminated — TODO confirm against the implementation
222 
224  COMPUTE_SERVICE_KILLED // presumably: the job ended because the compute service itself was killed — TODO confirm
225  };
226 
227  void terminateRunningStandardJob(std::shared_ptr<StandardJob> job, JobTerminationCause termination_cause);
228 
229  void failRunningStandardJob(std::shared_ptr<StandardJob> job, std::shared_ptr<FailureCause> cause);
230 
231  void processGetResourceInformation(const std::string &answer_mailbox);
232 
233  void processSubmitPilotJob(const std::string &answer_mailbox, std::shared_ptr<PilotJob> job, std::map<std::string, std::string> service_specific_args);
234 
235  void processSubmitStandardJob(const std::string &answer_mailbox, std::shared_ptr<StandardJob> job,
236  std::map<std::string, std::string> &service_specific_arguments);
237 
238  void processIsThereAtLeastOneHostWithAvailableResources(
239  const std::string &answer_mailbox, unsigned long num_cores, double ram);
240 
241  std::tuple<std::string, unsigned long> pickAllocation(WorkflowTask *task,
242  std::string required_host, unsigned long required_num_cores, double required_ram,
243  std::set<std::string> &hosts_to_avoid);
244 
245  bool jobCanRun(std::shared_ptr<StandardJob> job, std::map<std::string, std::string> &service_specific_arguments);
246 
247  bool isThereAtLeastOneHostWithResources(unsigned long num_cores, double ram);
248 
249  void cleanup(bool has_terminated_cleanly, int return_value) override;
250 
251  bool areAllComputeResourcesDownWithNoWUERunning();
252 
253 
254  int exit_code = 0;
255 
256  std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
257 
258  };
259 };
260 
261 
262 #endif //WRENCH_BAREMETALCOMPUTESERVICE_H
wrench::ComputeServiceMessagePayload::PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:44
wrench::ComputeServiceMessagePayload::IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask if one host has some resources a...
Definition: ComputeServiceMessagePayload.h:60
wrench::ComputeServiceMessagePayload::TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:54
wrench::ComputeServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:26
wrench::BareMetalComputeServiceMessagePayload::NOT_ENOUGH_CORES_MESSAGE_PAYLOAD
static const std::string NOT_ENOUGH_CORES_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not have sufficie...
Definition: BareMetalComputeServiceMessagePayload.h:28
wrench::ServiceMessagePayload::DAEMON_STOPPED_MESSAGE_PAYLOAD
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:33
wrench::ComputeServiceMessagePayload::TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job terminati...
Definition: ComputeServiceMessagePayload.h:36
wrench::ComputeServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask it for information on its resour...
Definition: ComputeServiceMessagePayload.h:56
wrench::ComputeServiceMessagePayload::IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state whether one host has some resou...
Definition: ComputeServiceMessagePayload.h:62
wrench::BareMetalComputeServiceProperty::TASK_STARTUP_OVERHEAD
static const std::string TASK_STARTUP_OVERHEAD
The overhead to start a thread, in seconds.
Definition: BareMetalComputeServiceProperty.h:28
wrench::ComputeService
The compute service base class.
Definition: ComputeService.h:33
wrench::CloudComputeService
A cloud-based compute service that manages a set of physical hosts and controls access to their resou...
Definition: CloudComputeService.h:36
wrench::BareMetalComputeService
A compute service that manages a set of multi-core compute hosts and provides access to their resourc...
Definition: BareMetalComputeService.h:48
wrench::ComputeServiceMessagePayload::TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:52
wrench::BatchComputeService
A batch-scheduled compute service that manages a set of compute hosts and controls access to their re...
Definition: BatchComputeService.h:49
wrench::ComputeServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standa...
Definition: ComputeServiceMessagePayload.h:30
wrench::ComputeServiceMessagePayload::STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job ha...
Definition: ComputeServiceMessagePayload.h:32
wrench
Definition: Alarm.cpp:20
wrench::ComputeServiceMessagePayload::TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:34
wrench::ComputeServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission...
Definition: ComputeServiceMessagePayload.h:40
wrench::WorkflowTask
A computational task in a Workflow.
Definition: WorkflowTask.h:31
wrench::ComputeServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:58
wrench::ServiceMessagePayload::STOP_DAEMON_MESSAGE_PAYLOAD
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:31
wrench::ComputeServiceMessagePayload::PILOT_JOB_STARTED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:42
wrench::ComputeServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submissio...
Definition: ComputeServiceMessagePayload.h:28
wrench::Simulation
A class that provides basic simulation methods. Once the simulation object has been explicitly or imp...
Definition: Simulation.h:46
wrench::BareMetalComputeService::BareMetalComputeService
BareMetalComputeService(const std::string &hostname, const std::map< std::string, std::tuple< unsigned long, double >> compute_resources, std::string scratch_space_mount_point, std::map< std::string, std::string > property_list={}, std::map< std::string, double > messagepayload_list={})
Constructor.
Definition: BareMetalComputeService.cpp:290
wrench::ComputeServiceMessagePayload::JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
static const std::string JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not support the t...
Definition: ComputeServiceMessagePayload.h:24
wrench::ComputeServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:38
wrench::ComputeServiceProperty::SUPPORTS_PILOT_JOBS
static const std::string SUPPORTS_PILOT_JOBS
Whether the compute service supports pilot jobs (true or false)
Definition: ComputeServiceProperty.h:26
wrench::BareMetalComputeServiceProperty::TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
static const std::string TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
Whether the service should terminate when all hosts are down.
Definition: BareMetalComputeServiceProperty.h:30
wrench::ComputeServiceMessagePayload::PILOT_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has failed.
Definition: ComputeServiceMessagePayload.h:46
wrench::ComputeServiceProperty::SUPPORTS_STANDARD_JOBS
static const std::string SUPPORTS_STANDARD_JOBS
Whether the compute service supports standard jobs (true or false)
Definition: ComputeServiceProperty.h:24