BareMetalComputeService.h
1 
10 #ifndef WRENCH_BAREMETALCOMPUTESERVICE_H
11 #define WRENCH_BAREMETALCOMPUTESERVICE_H
12 
13 
14 #include <queue>
15 
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/compute/standard_job_executor/StandardJobExecutor.h"
18 #include "BareMetalComputeServiceProperty.h"
19 #include "BareMetalComputeServiceMessagePayload.h"
20 
21 namespace wrench {
22 
23  class Simulation;
24 
25  class StorageService;
26 
27  class FailureCause;
28 
29  class Alarm;
30 
31 
45 
46  friend class CloudService;
47  friend class BatchService;
48 
49  private:
50 
51  std::map<std::string, std::string> default_property_values = {
55  };
56 
57  std::map<std::string, std::string> default_messagepayload_values = {
77  };
78 
79  public:
80 
81  // Public constructor: the caller supplies an explicit compute_resources map (string -> (unsigned long, double) tuple)
82  BareMetalComputeService(const std::string &hostname,
83  const std::map<std::string, std::tuple<unsigned long, double>> compute_resources, // passed by value; NOTE(review): presumably hostname -> (number of cores, RAM) -- confirm against the .cpp
84  double scratch_space_size, // NOTE(review): units are not visible in this header -- presumably bytes, confirm
85  std::map<std::string, std::string> property_list = {}, // optional; defaults to an empty map
86  std::map<std::string, std::string> messagepayload_list = {} // optional; defaults to an empty map
87  );
88 
89  // Public constructor: same as the overload above except that it takes a set of strings (compute_hosts) instead of a resource map
90  BareMetalComputeService(const std::string &hostname,
91  const std::set<std::string> compute_hosts, // passed by value; NOTE(review): presumably the names of the hosts to use -- how their cores/RAM are determined is not visible here
92  double scratch_space_size, // NOTE(review): units are not visible in this header -- presumably bytes, confirm
93  std::map<std::string, std::string> property_list = {}, // optional; defaults to an empty map
94  std::map<std::string, std::string> messagepayload_list = {} // optional; defaults to an empty map
95  );
96 
97 
98  /***********************/
100  /***********************/
101 
102  void validateProperties(); // takes no arguments and returns nothing; NOTE(review): presumably checks the service's property values -- failure behavior is defined in the .cpp, confirm there
103 
104  void submitStandardJob(StandardJob *job, std::map<std::string, std::string> &service_specific_args) override; // override of a base-class virtual; service_specific_args is a non-const reference
105 
106  void submitPilotJob(PilotJob *job, std::map<std::string, std::string> &service_specific_args) override; // pilot-job counterpart of submitStandardJob(), same argument shape
107 
108  void terminateStandardJob(StandardJob *job) override; // override of a base-class virtual; takes only the job pointer
109 
110  void terminatePilotJob(PilotJob *job) override; // pilot-job counterpart of terminateStandardJob()
111 
113 
114  /***********************/
116  /***********************/
117 
118  private:
119 
120  friend class Simulation;
121 
122  // Low-level constructor (private section): no defaulted arguments; reachable only from this class and the friend classes declared in it
123  BareMetalComputeService(const std::string &hostname,
124  std::map<std::string, std::tuple<unsigned long, double>> compute_resources, // same map type as the compute_resources data member
125  std::map<std::string, std::string> property_list,
126  std::map<std::string, std::string> messagepayload_list,
127  double ttl, // NOTE(review): presumably a time-to-live; units and any "no limit" sentinel are not visible here -- confirm
128  PilotJob *pj, std::string suffix, // NOTE(review): pj is presumably the pilot job this service runs within (see containing_pilot_job); the use of suffix is not visible here
129  StorageService* scratch_space); // reference to upper level scratch space (passed as a raw pointer)
130 
131  // Private constructor: like the low-level constructor above but without the ttl, pj and suffix arguments
132  BareMetalComputeService(const std::string &hostname,
133  std::map<std::string, std::tuple<unsigned long, double>> compute_resources, // same map type as the compute_resources data member
134  std::map<std::string, std::string> property_list,
135  std::map<std::string, std::string> messagepayload_list,
136  StorageService* scratch_space); // raw pointer to a StorageService; NOTE(review): ownership is not visible in this header -- confirm
137 
138  // Low-level constructor helper method: takes the low-level constructor's arguments except suffix and scratch_space
139  void initiateInstance(const std::string &hostname,
140  std::map<std::string, std::tuple<unsigned long, double>> compute_resources, // passed by value
141  std::map<std::string, std::string> property_list, // passed by value
142  std::map<std::string, std::string> messagepayload_list, // passed by value
143  double ttl, // NOTE(review): presumably a time-to-live -- units not visible here, confirm
144  PilotJob *pj); // NOTE(review): presumably the containing pilot job, possibly null when there is none -- confirm
145 
146 
147  std::map<std::string, std::tuple<unsigned long, double>> compute_resources; // resource description; same map type as the constructors' compute_resources argument
148 
149  // Per-host bookkeeping (key presumably the hostname): bytes of RAM currently available on each host, and the number of threads currently running on it
150  std::map<std::string, double> ram_availabilities;
151  std::map<std::string, unsigned long> running_thread_counts;
152 
153  unsigned long total_num_cores; // NOTE(review): presumably the sum of cores over all compute resources -- confirm where it is computed
154 
155  double ttl; // NOTE(review): presumably the time-to-live given to the low-level constructor -- confirm
156  bool has_ttl; // NOTE(review): presumably whether ttl is meaningful for this instance -- confirm
157  double death_date; // NOTE(review): presumably the date at which the service expires when it has a ttl -- confirm
158  std::shared_ptr<Alarm> death_alarm = nullptr; // starts out null
159 
160  PilotJob *containing_pilot_job; // In case this service is in fact a pilot job
161 
162  std::map<StandardJob *, std::set<WorkflowFile*>> files_in_scratch; // for each standard job, a set of files (per the name, files placed in scratch space)
163 
164  // Set of running jobs (raw StandardJob pointers)
165  std::set<StandardJob *> running_jobs;
166 
167  // Job task execution specs: for each job and each of its tasks, a (string, unsigned long) tuple -- the same tuple type that pickAllocation() returns
168  std::map<StandardJob *, std::map<WorkflowTask *, std::tuple<std::string, unsigned long>>> job_run_specs;
169 
170  // Map of all Workunits, per job; held through std::unique_ptr
171  std::map<StandardJob *, std::set<std::unique_ptr<Workunit>>> all_workunits;
172 
173  std::deque<Workunit *> ready_workunits; // raw Workunit pointers; NOTE(review): presumably they refer to entries of all_workunits -- confirm
174 // std::map<StandardJob *, std::set<Workunit *>> running_workunits; (declaration kept commented out, as in the original)
175  std::map<StandardJob *, std::set<Workunit *>> completed_workunits; // raw Workunit pointers, grouped per job
176 
177  // Running WorkunitExecutors, grouped per job and held through std::shared_ptr
178  std::map<StandardJob *, std::set<std::shared_ptr<WorkunitExecutor>>> workunit_executors;
179 
180  // Add the scratch files of one standard job to the list of all the scratch files of all the standard jobs inside the pilot job
181  void storeFilesStoredInScratch(std::set<WorkflowFile*> scratch_files); // the set is passed by value
182 
183  // Clean up the scratch space if this service is a pilot job
184  void cleanUpScratch(); // takes no arguments and returns nothing
185 
186  int main() override; // override of a base-class virtual; the helper methods declared below exist to keep its body readable
187 
188  // Helper functions that keep main() readable; all are private and their bodies live in the .cpp
189 
190  void terminate(bool notify_pilot_job_submitters); // the flag name suggests it controls whether pilot job submitters are notified -- confirm in the .cpp
191 
192  void failCurrentStandardJobs(); // takes no arguments; NOTE(review): presumably fails every currently running standard job -- confirm
193 
194  void processWorkunitExecutorCompletion(WorkunitExecutor *workunit_executor, Workunit *workunit); // receives both the executor and the workunit it handled
195 
196  void processWorkunitExecutorFailure(WorkunitExecutor *workunit_executor, Workunit *workunit, std::shared_ptr<FailureCause> cause); // as above, plus the failure cause
197 
198  void processWorkunitExecutorCrash(WorkunitExecutor *workunit_executor); // takes only the executor, no workunit or cause
199 
200  void forgetWorkunitExecutor(WorkunitExecutor *workunit_executor); // NOTE(review): presumably drops the executor from workunit_executors -- confirm
201 
202 
203  void processStandardJobTerminationRequest(StandardJob *job, const std::string &answer_mailbox); // NOTE(review): answer_mailbox is presumably where the reply is sent -- confirm
204 
205  bool processNextMessage(); // NOTE(review): the meaning of the bool result is not visible here -- presumably whether the service should keep running, confirm
206 
207  void dispatchReadyWorkunits(); // NOTE(review): presumably starts executors for entries of ready_workunits -- confirm
208 
209 
211  enum JobTerminationCause { // unscoped enum; passed to terminateRunningStandardJob() as its termination_cause argument
213  TERMINATED, // NOTE(review): presumably the job was terminated on request -- confirm
214 
216  COMPUTE_SERVICE_KILLED // NOTE(review): presumably the job ended because the compute service itself was killed -- confirm
217  };
218 
219  void terminateRunningStandardJob(StandardJob *job, JobTerminationCause termination_cause); // termination_cause is one of the JobTerminationCause values declared above
220 
221  void failRunningStandardJob(StandardJob *job, std::shared_ptr<FailureCause> cause); // takes the failure cause as a shared_ptr, passed by value
222 
223  void processGetResourceInformation(const std::string &answer_mailbox); // NOTE(review): answer_mailbox is presumably where the reply is sent -- confirm
224 
225  void processSubmitPilotJob(const std::string &answer_mailbox, PilotJob *job, std::map<std::string, std::string> service_specific_args); // NOTE(review): the argument map is taken by value here but by reference in processSubmitStandardJob() -- intentional?
226 
227  void processSubmitStandardJob(const std::string &answer_mailbox, StandardJob *job,
228  std::map<std::string, std::string> &service_specific_arguments); // non-const reference
229 
230  std::tuple<std::string, unsigned long> pickAllocation(WorkflowTask *task, // returns a (string, unsigned long) tuple; NOTE(review): presumably (chosen host, number of cores) -- confirm
231  std::string required_host, unsigned long required_num_cores, double required_ram, // NOTE(review): how "no requirement" is encoded for these is not visible here
232  std::set<std::string> &hosts_to_avoid); // non-const reference; NOTE(review): whether the callee modifies the set is not visible here
233 
234  bool jobCanRun(StandardJob *job, std::map<std::string, std::string> &service_specific_arguments); // per the name, answers whether the job can run given its service-specific arguments
235 
236  bool isThereAtLeastOneHostWithResources(unsigned long num_cores, double ram); // per the name, answers whether at least one host offers num_cores cores and ram RAM
237  };
238 };
239 
240 
241 #endif //WRENCH_BAREMETALCOMPUTESERVICE_H
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired...
Definition: ComputeServiceMessagePayload.h:44
A batch-scheduled compute service that manages a set of compute hosts and controls access to their resources...
Definition: BatchService.h:47
BareMetalComputeService(const std::string &hostname, const std::map< std::string, std::tuple< unsigned long, double >> compute_resources, double scratch_space_size, std::map< std::string, std::string > property_list={}, std::map< std::string, std::string > messagepayload_list={})
Constructor.
Definition: BareMetalComputeService.cpp:273
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:31
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job...
Definition: ComputeServiceMessagePayload.h:34
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination...
Definition: ComputeServiceMessagePayload.h:54
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission...
Definition: ComputeServiceMessagePayload.h:40
A computational task in a Workflow.
Definition: WorkflowTask.h:26
static const std::string PILOT_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has failed...
Definition: ComputeServiceMessagePayload.h:46
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submission...
Definition: ComputeServiceMessagePayload.h:28
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job...
Definition: ComputeServiceMessagePayload.h:52
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job ha...
Definition: ComputeServiceMessagePayload.h:32
static const std::string JOB_TYPE_NOT_SUPPORTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not support the t...
Definition: ComputeServiceMessagePayload.h:24
static const std::string NOT_ENOUGH_CORES_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it does not have sufficient...
Definition: BareMetalComputeServiceMessagePayload.h:28
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job termination...
Definition: ComputeServiceMessagePayload.h:36
A class that provides basic simulation methods.
Definition: Simulation.h:34
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standard job...
Definition: ComputeServiceMessagePayload.h:30
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:38
The compute service base class.
Definition: ComputeService.h:35
A cloud-based compute service that manages a set of physical hosts and controls access to their resources...
Definition: CloudService.h:37
static const std::string SUPPORTS_PILOT_JOBS
Whether the compute service supports pilot jobs (true or false)
Definition: ComputeServiceProperty.h:26
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask it for information on its resources...
Definition: ComputeServiceMessagePayload.h:56
static const std::string SUPPORTS_STANDARD_JOBS
Whether the compute service supports standard jobs (true or false)
Definition: ComputeServiceProperty.h:24
The storage service base class.
Definition: StorageService.h:35
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job...
Definition: ComputeServiceMessagePayload.h:26
A compute service that manages a set of multi-core compute hosts and provides access to their resources...
Definition: BareMetalComputeService.h:44
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started...
Definition: ComputeServiceMessagePayload.h:42
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:33
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources...
Definition: ComputeServiceMessagePayload.h:58
Definition: TerminalOutput.cpp:15
static const std::string THREAD_STARTUP_OVERHEAD
The overhead to start a thread, in seconds.
Definition: BareMetalComputeServiceProperty.h:28