BareMetalComputeService.h
1 
10 #ifndef WRENCH_BAREMETALCOMPUTESERVICE_H
11 #define WRENCH_BAREMETALCOMPUTESERVICE_H
12 
13 
14 #include <queue>
15 
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/compute/standard_job_executor/StandardJobExecutor.h"
18 #include "BareMetalComputeServiceProperty.h"
19 #include "BareMetalComputeServiceMessagePayload.h"
20 #include "wrench/services/compute/workunit_executor/Workunit.h"
21 #include "wrench/services/helpers/HostStateChangeDetector.h"
22 
23 
24 
25 namespace wrench {
26 
// Forward declarations. Within this header these types are only named in a
// friend declaration (Simulation) or as std::shared_ptr template arguments
// (StorageService, FailureCause, Alarm).
27  class Simulation;
28 
29  class StorageService;
30 
31  class FailureCause;
32 
33  class Alarm;
34 
35 
49 
// These two classes are granted access to this class's private members,
// which include the non-public constructors declared further down.
// NOTE(review): the enclosing class header line is missing from this listing.
50  friend class CloudComputeService;
51  friend class BatchComputeService;
52 
53  private:
54 
// Default property values, as a string -> string map.
// NOTE(review): the initializer entries (listing lines 56-59) are absent from
// this listing; consult the real header for the actual defaults.
55  std::map<std::string, std::string> default_property_values = {
60  };
61 
// Default message payload values, as a string -> double map.
// NOTE(review): the initializer entries (listing lines 63-81) are absent from
// this listing; consult the real header for the actual defaults.
62  std::map<std::string, double> default_messagepayload_values = {
82  };
83 
84  public:
85 
86  // Public Constructor
// Parameters:
//   hostname                  - host name, passed by const reference.
//   compute_resources         - map from string keys to (unsigned long, double)
//                               tuples; presumably host name -> (number of
//                               cores, RAM in bytes) -- TODO confirm.
//   scratch_space_mount_point - mount point of the scratch space.
//   property_list             - property overrides; defaults to an empty map.
//   messagepayload_list       - message payload overrides; defaults to an
//                               empty map.
// NOTE(review): compute_resources is declared as a const by-value parameter,
// so the whole map is copied at each call.
87  BareMetalComputeService(const std::string &hostname,
88  const std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
89  std::string scratch_space_mount_point,
90  std::map<std::string, std::string> property_list = {},
91  std::map<std::string, double> messagepayload_list = {}
92  );
93 
94  // Public Constructor
// Same shape as the other public constructor, except that the compute
// resources are given as a vector of strings (compute_hosts) rather than as a
// map to (unsigned long, double) tuples.
// NOTE(review): presumably the strings are host names whose full resources
// are used -- TODO confirm against the implementation.
// property_list and messagepayload_list default to empty maps.
95  BareMetalComputeService(const std::string &hostname,
96  const std::vector<std::string> compute_hosts,
97  std::string scratch_space_mount_point,
98  std::map<std::string, std::string> property_list = {},
99  std::map<std::string, double> messagepayload_list = {}
100  );
101 
102 
103  /***********************/
105  /***********************/
106 
// Job submission and termination entry points. All four are marked
// `override`, so they replace virtual functions of a base class (presumably
// ComputeService, whose header is included above -- the class header line is
// not visible in this listing). The submit functions take their
// service-specific arguments by non-const reference.
107  void submitStandardJob(StandardJob *job, std::map<std::string, std::string> &service_specific_args) override;
108 
109  void submitPilotJob(PilotJob *job, std::map<std::string, std::string> &service_specific_args) override;
110 
111  void terminateStandardJob(StandardJob *job) override;
112 
113  void terminatePilotJob(PilotJob *job) override;
114 
116 
117  /***********************/
119  /***********************/
120 
121  private:
122 
// Simulation is granted access to this class's private members.
123  friend class Simulation;
124 
// Takes no arguments and returns nothing.
// NOTE(review): presumably checks the configured property values and reports
// invalid ones -- TODO confirm how errors are signalled.
125  void validateProperties();
126 
127  // Low-level Constructor
// Declared in the private section. Unlike the public constructors it has no
// default arguments and additionally takes:
//   ttl           - a double; presumably a time-to-live in seconds, cf. the
//                   ttl / has_ttl / death_date members -- TODO confirm units.
//   pj            - a PilotJob pointer, cf. the containing_pilot_job member.
//   suffix        - a string; NOTE(review): presumably appended to the service
//                   name -- TODO confirm.
//   scratch_space - a shared StorageService, passed in instead of a mount point.
128  BareMetalComputeService(const std::string &hostname,
129  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
130  std::map<std::string, std::string> property_list,
131  std::map<std::string, double> messagepayload_list,
132  double ttl,
133  PilotJob *pj, std::string suffix,
134  std::shared_ptr<StorageService> scratch_space); // reference to upper level scratch space
135 
136  // Private Constructor
// Takes the same leading arguments as the low-level constructor (hostname,
// compute resources, properties, message payloads) plus a shared scratch-space
// StorageService, but no ttl, pilot job or suffix.
137  BareMetalComputeService(const std::string &hostname,
138  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
139  std::map<std::string, std::string> property_list,
140  std::map<std::string, double> messagepayload_list,
141  std::shared_ptr<StorageService> scratch_space);
142 
143  // Low-level constructor helper method
// Receives the same values as the low-level constructor, minus the suffix and
// the scratch space. All maps are passed by value.
144  void initiateInstance(const std::string &hostname,
145  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
146  std::map<std::string, std::string> property_list,
147  std::map<std::string, double> messagepayload_list,
148  double ttl,
149  PilotJob *pj);
150 
151 
// Compute resources, as a map from string keys to (unsigned long, double)
// tuples; presumably host name -> (number of cores, RAM in bytes) -- TODO confirm.
152  std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
153 
154  // Per-host usage tracking (for each host: how many bytes of RAM are currently available on it,
// and how many threads are running on it). NOTE(review): presumably both maps are keyed by host name.
155  std::map<std::string, double> ram_availabilities;
156  std::map<std::string, unsigned long> running_thread_counts;
157 
// NOTE(review): presumably the core count summed over all compute resources -- TODO confirm.
158  unsigned long total_num_cores;
159 
// Lifetime bookkeeping. NOTE(review): presumably has_ttl says whether ttl is
// meaningful and death_date is the date at which the service expires -- TODO confirm.
160  double ttl;
161  bool has_ttl;
162  double death_date;
// Starts out null.
163  std::shared_ptr<Alarm> death_alarm = nullptr;
164 
165  PilotJob *containing_pilot_job; // In case this service is in fact a pilot job
166 
// Per-job set of workflow files; judging by the name, the files each job has
// placed in scratch space (see storeFilesStoredInScratch / cleanUpScratch).
167  std::map<StandardJob *, std::set<WorkflowFile*>> files_in_scratch;
168 
169  // Set of running jobs
170  std::set<StandardJob *> running_jobs;
171 
172  // Job task execution specs: per job, per task, a (string, unsigned long) tuple
// (presumably host name and number of cores -- TODO confirm; pickAllocation() returns the same tuple type)
173  std::map<StandardJob *, std::map<WorkflowTask *, std::tuple<std::string, unsigned long>>> job_run_specs;
174 
175  // Map of all Workunits, grouped by the job they belong to
176  std::map<StandardJob *, std::set<std::shared_ptr<Workunit>>> all_workunits;
177 
// Ready work units are kept in one deque shared by all jobs, whereas
// completed work units are grouped per job.
178  std::deque<std::shared_ptr<Workunit>> ready_workunits;
179 // std::map<StandardJob *, std::set<Workunit *>> running_workunits;
180  std::map<StandardJob *, std::set<std::shared_ptr<Workunit>>> completed_workunits;
181 
182  // Set of running WorkunitExecutors, grouped by job
183  std::map<StandardJob *, std::set<std::shared_ptr<WorkunitExecutor>>> workunit_executors;
184 
185  // Add the scratch files of one standard job to the list of all the scratch files of all the standard jobs inside the pilot job
// (the set of files is passed by value)
186  void storeFilesStoredInScratch(std::set<WorkflowFile*> scratch_files);
187 
188  // Clean up the scratch space if I am a pilot job
189  void cleanUpScratch();
190 
// Overrides a virtual main() of a base class and returns an int.
// NOTE(review): presumably the service's main loop -- the helper declarations
// that follow are described as making main() more palatable.
191  int main() override;
192 
193  // Helper functions to make main() a bit more palatable
194 
// notify_pilot_job_submitters: presumably controls whether the submitters of
// a containing pilot job are told about the termination -- TODO confirm.
195  void terminate(bool notify_pilot_job_submitters);
196 
197  void failCurrentStandardJobs();
198 
// Handlers for WorkunitExecutor outcomes. Completion and failure receive both
// the executor and its work unit (failure also gets the cause); crash and
// forget receive only the executor. All arguments are shared_ptrs passed by value.
199  void processWorkunitExecutorCompletion(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit);
200 
201  void processWorkunitExecutorFailure(std::shared_ptr<WorkunitExecutor> workunit_executor, std::shared_ptr<Workunit> workunit, std::shared_ptr<FailureCause> cause);
202 
203  void processWorkunitExecutorCrash(std::shared_ptr<WorkunitExecutor> workunit_executor);
204 
205  void forgetWorkunitExecutor(std::shared_ptr<WorkunitExecutor> workunit_executor);
206 
// answer_mailbox: name of the mailbox for the reply (by its name; TODO confirm).
207  void processStandardJobTerminationRequest(StandardJob *job, const std::string &answer_mailbox);
208 
// NOTE(review): the meaning of the bool result is not visible here; presumably
// it tells main() whether to keep processing messages -- TODO confirm.
209  bool processNextMessage();
210 
211  void dispatchReadyWorkunits();
212 
213 // void someHostIsBackOn(simgrid::s4u::Host const &h);
214 // bool host_back_on = false;
215 
216 
// Reason passed to terminateRunningStandardJob() for stopping a running job.
// NOTE(review): the per-enumerator documentation is missing from this listing;
// the descriptions below are inferred from the names -- TODO confirm.
218  enum JobTerminationCause {
// presumably: the job was terminated on request
220  TERMINATED,
221 
// presumably: the compute service itself was killed
223  COMPUTE_SERVICE_KILLED
224  };
225 
// Stops a running standard job; termination_cause records why.
226  void terminateRunningStandardJob(StandardJob *job, JobTerminationCause termination_cause);
227 
// Fails a running standard job with the given failure cause.
228  void failRunningStandardJob(StandardJob *job, std::shared_ptr<FailureCause> cause);
229 
// Request handlers; each takes the mailbox name to which the answer goes
// (by the parameter name; TODO confirm).
230  void processGetResourceInformation(const std::string &answer_mailbox);
231 
// NOTE(review): service_specific_args is taken by value here but by non-const
// reference in processSubmitStandardJob below -- confirm the difference is intended.
232  void processSubmitPilotJob(const std::string &answer_mailbox, PilotJob *job, std::map<std::string, std::string> service_specific_args);
233 
234  void processSubmitStandardJob(const std::string &answer_mailbox, StandardJob *job,
235  std::map<std::string, std::string> &service_specific_arguments);
236 
// Returns a (string, unsigned long) tuple, the same type stored per task in
// job_run_specs; presumably (host name, number of cores) -- TODO confirm.
// hosts_to_avoid is passed by non-const reference.
237  std::tuple<std::string, unsigned long> pickAllocation(WorkflowTask *task,
238  std::string required_host, unsigned long required_num_cores, double required_ram,
239  std::set<std::string> &hosts_to_avoid);
240 
241  bool jobCanRun(StandardJob *job, std::map<std::string, std::string> &service_specific_arguments);
242 
243  bool isThereAtLeastOneHostWithResources(unsigned long num_cores, double ram);
244 
// Overrides a base-class virtual.
245  void cleanup(bool has_terminated_cleanly, int return_value) override;
246 
// "WUE" in the name presumably stands for WorkunitExecutor.
247  bool areAllComputeResourcesDownWithNoWUERunning();
248 
249 
// Initialised to 0. NOTE(review): presumably the value main() returns -- TODO confirm.
250  int exit_code = 0;
251 
// Shared handle to a HostStateChangeDetector (declared in the
// HostStateChangeDetector.h header included above).
252  std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
253 
254  };
255 };
256 
257 
258 #endif //WRENCH_BAREMETALCOMPUTESERVICE_H
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:44
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:54
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:26
static const std::string NOT_ENOUGH_CORES_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not have sufficient cores.
Definition: BareMetalComputeServiceMessagePayload.h:28
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:33
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job termination.
Definition: ComputeServiceMessagePayload.h:36
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask it for information on its resources.
Definition: ComputeServiceMessagePayload.h:56
static const std::string TASK_STARTUP_OVERHEAD
The overhead to start a thread, in seconds.
Definition: BareMetalComputeServiceProperty.h:28
The compute service base class.
Definition: ComputeService.h:35
A cloud-based compute service that manages a set of physical hosts and controls access to their resou...
Definition: CloudComputeService.h:37
A compute service that manages a set of multi-core compute hosts and provides access to their resourc...
Definition: BareMetalComputeService.h:48
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:52
A batch-scheduled compute service that manages a set of compute hosts and controls access to their re...
Definition: BatchComputeService.h:49
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standard job.
Definition: ComputeServiceMessagePayload.h:30
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job has failed.
Definition: ComputeServiceMessagePayload.h:32
Definition: Alarm.cpp:20
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:34
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission.
Definition: ComputeServiceMessagePayload.h:40
A computational task in a Workflow.
Definition: WorkflowTask.h:27
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:58
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:31
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:42
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submission.
Definition: ComputeServiceMessagePayload.h:28
A class that provides basic simulation methods. Once the simulation object has been explicitly or imp...
Definition: Simulation.h:45
BareMetalComputeService(const std::string &hostname, const std::map< std::string, std::tuple< unsigned long, double >> compute_resources, std::string scratch_space_mount_point, std::map< std::string, std::string > property_list={}, std::map< std::string, double > messagepayload_list={})
Constructor.
Definition: BareMetalComputeService.cpp:299
static const std::string JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not support the t...
Definition: ComputeServiceMessagePayload.h:24
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:38
static const std::string SUPPORTS_PILOT_JOBS
Whether the compute service supports pilot jobs (true or false)
Definition: ComputeServiceProperty.h:26
static const std::string TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
Whether the service should terminate when all hosts are down.
Definition: BareMetalComputeServiceProperty.h:30
static const std::string PILOT_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has failed.
Definition: ComputeServiceMessagePayload.h:46
static const std::string SUPPORTS_STANDARD_JOBS
Whether the compute service supports standard jobs (true or false)
Definition: ComputeServiceProperty.h:24