WRENCH  1.11
Cyberinfrastructure Simulation Workbench
Overview Installation Getting Started WRENCH 101 WRENCH 102
BatchComputeService.h
1 
11 #ifndef WRENCH_BATCH_SERVICE_H
12 #define WRENCH_BATCH_SERVICE_H
13 
14 #include "wrench/services/compute/ComputeService.h"
15 #include "wrench/services/compute/batch/BatchJob.h"
16 #include "wrench/services/compute/batch/BatschedNetworkListener.h"
17 #include "wrench/services/compute/batch/BatchComputeServiceProperty.h"
18 #include "wrench/services/compute/batch/BatchComputeServiceMessagePayload.h"
19 #include "wrench/services/helper_services/alarm/Alarm.h"
20 #include "wrench/job/CompoundJob.h"
21 #include "wrench/job/Job.h"
22 #include "wrench/services/compute/batch/batch_schedulers/BatchScheduler.h"
23 
24 #include <deque>
25 #include <queue>
26 #include <set>
27 #include <tuple>
28 
29 namespace wrench {
30 
31  class WorkloadTraceFileReplayer;
32  class BareMetalComputeServiceOneShot;
33 
50 
54  private:
55 
56  WRENCH_PROPERTY_COLLECTION_TYPE default_property_values = {
60 #ifdef ENABLE_BATSCHED
62 // {BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM, "easy_bf"},
63 // {BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM, "easy_bf_fast"},
64 
66 #else
68 #endif
78  };
79 
80 WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE default_messagepayload_values = {
105  };
106 
107  public:
108  BatchComputeService(const std::string &hostname,
109  std::vector<std::string> compute_hosts,
110  std::string scratch_space_mount_point,
111  WRENCH_PROPERTY_COLLECTION_TYPE property_list = {},
112  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list = {}
113  );
114 
115  bool supportsStandardJobs() override;
116  bool supportsCompoundJobs() override;
117  bool supportsPilotJobs() override;
118 
119  /***********************/
121  /***********************/
122  std::map<std::string,double> getStartTimeEstimates(std::set<std::tuple<std::string,unsigned long,unsigned long, double>> resources);
123 
124  std::vector<std::tuple<std::string, std::string, int, int, int, double, double>> getQueue();
125 
126  /***********************/
128  /***********************/
129 
130  /***********************/
132  /***********************/
133  ~BatchComputeService() override;
134  // helper function
135  static unsigned long parseUnsignedLongServiceSpecificArgument(std::string key, const std::map<std::string, std::string> &args);
136 
137  void validateServiceSpecificArguments(std::shared_ptr<CompoundJob> compound_job,
138  std::map<std::string, std::string> &service_specific_args) override;
139 
140  /***********************/
142  /***********************/
143 
144  private:
145  friend class WorkloadTraceFileReplayer;
146  friend class FCFSBatchScheduler;
147  friend class ConservativeBackfillingBatchScheduler;
148  friend class ConservativeBackfillingBatchSchedulerCoreLevel;
149 
150  friend class BatschedBatchScheduler;
151 
152  BatchComputeService(const std::string hostname,
153  std::vector<std::string> compute_hosts,
154  unsigned long cores_per_host,
155  double ram_per_host,
156  std::string scratch_space_mount_point,
157  WRENCH_PROPERTY_COLLECTION_TYPE property_list,
158  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list,
159  std::string suffix
160  );
161 
162 //submits a compound job
163  void submitCompoundJob(std::shared_ptr<CompoundJob> job, const std::map<std::string, std::string> &batch_job_args) override;
164 
165 // terminate a compound job
166  void terminateCompoundJob(std::shared_ptr<CompoundJob> job) override;
167 
168  std::vector<std::tuple<std::string, double, double, double, double, unsigned int, std::string>> workload_trace;
169  std::shared_ptr<WorkloadTraceFileReplayer> workload_trace_replayer;
170 
171  bool clean_exit = false;
172 
173  //create alarms for compound jobs
174  std::map<std::shared_ptr<CompoundJob>,std::shared_ptr<Alarm>> compound_job_alarms;
175 
176  /* Resources information in batch_standard_and_pilot_jobs */
177  unsigned long total_num_of_nodes;
178  unsigned long num_cores_per_node;
179  std::map<std::string, unsigned long> nodes_to_cores_map;
180  std::vector<double> timeslots;
181  std::map<std::string, unsigned long> available_nodes_to_cores;
182  std::map<unsigned long, std::string> host_id_to_names;
183  std::vector<std::string> compute_hosts;
184  /* End Resources information in batch_standard_and_pilot_jobs */
185 
186 // Map from compound jobs to one-shot bare-metal compute services
187  std::map<std::shared_ptr<CompoundJob>, std::shared_ptr<BareMetalComputeServiceOneShot>> running_bare_metal_one_shot_compute_services;
188 
189 // Master list of batch jobs
190  std::map<std::shared_ptr<CompoundJob>, std::shared_ptr<BatchJob>> all_jobs;
191 
192 //A map of running batch jobs, keyed by compound job
193  std::map<std::shared_ptr<CompoundJob>, std::shared_ptr<BatchJob>> running_jobs;
194 
195 // The batch queue
196  std::deque<std::shared_ptr<BatchJob>> batch_queue;
197 
198 // A set of "waiting" batch jobs, i.e., jobs that are waiting to be sent to
199  // the scheduler (useful for batsched only)
200  std::set<std::shared_ptr<BatchJob>> waiting_jobs;
201 
202  // Scheduler
203  std::unique_ptr<BatchScheduler> scheduler;
204 
205 
206 #ifdef ENABLE_BATSCHED
207 
208  std::set<std::string> scheduling_algorithms = {"conservative_bf", "crasher", "easy_bf", "easy_bf_fast",
209  "easy_bf_plot_liquid_load_horizon",
210  "energy_bf", "energy_bf_dicho", "energy_bf_idle_sleeper",
211  "energy_bf_monitoring",
212  "energy_bf_monitoring_inertial", "energy_bf_subpart_sleeper",
213  "energy_watcher", "fcfs_fast", "fast_conservative_bf",
214  "filler", "killer", "killer2", "random", "rejecter",
215  "sequencer", "sleeper", "submitter", "waiting_time_estimator"
216  };
217 
218  std::set<std::string> queue_ordering_options = {"fcfs", "lcfs", "desc_bounded_slowdown", "desc_slowdown",
219  "asc_size", "desc_size", "asc_walltime", "desc_walltime"
220 
221  };
222 #else
223  std::set<std::string> scheduling_algorithms = {"fcfs", "conservative_bf", "conservative_bf_core_level"
224  };
225 
226  //Batch queue ordering options
227  std::set<std::string> queue_ordering_options = {
228  };
229 
230 #endif
231 
232  unsigned long generateUniqueJobID();
233 
234  void removeJobFromRunningList(std::shared_ptr<BatchJob> job);
235 
236  void removeJobFromBatchQueue(std::shared_ptr<BatchJob> job);
237 
238  void removeBatchJobFromJobsList(std::shared_ptr<BatchJob> job);
239 
240  int main() override;
241 
242  bool processNextMessage();
243 
244  void startBackgroundWorkloadProcess();
245 
246  void processGetResourceInformation(simgrid::s4u::Mailbox *answer_mailbox, const std::string &key);
247 
248  void processCompoundJobCompletion(std::shared_ptr<BareMetalComputeServiceOneShot> executor, std::shared_ptr<CompoundJob> job);
249 
250  void processCompoundJobFailure(std::shared_ptr<BareMetalComputeServiceOneShot> executor,
251  std::shared_ptr<CompoundJob> job,
252  std::shared_ptr<FailureCause> cause);
253 
254  void terminateRunningCompoundJob(std::shared_ptr<CompoundJob> job, ComputeService::TerminationCause termination_cause);
255 
256 
257 //Terminate the batch service (this is usually for pilot jobs when they act as a batch service)
258  void cleanup(bool has_returned_from_main, int return_value) override;
259 
260  // Terminate
261  void terminate(bool send_failure_notifications, ComputeService::TerminationCause termination_cause);
262 
263 //Process compound job timeout
264  void processCompoundJobTimeout(std::shared_ptr<CompoundJob> job);
265 
266 //process compound job termination request
267  void processCompoundJobTerminationRequest(std::shared_ptr<CompoundJob> job, simgrid::s4u::Mailbox *answer_mailbox);
268 
269 // process a batch job timeout event
270  void processAlarmJobTimeout(std::shared_ptr<BatchJob>bach_job);
271 
272  //free up resources
273  void freeUpResources(std::map<std::string, std::tuple<unsigned long, double>> resources);
274 
275  //send call back to the pilot job submitters
276  void sendPilotJobExpirationNotification(std::shared_ptr<PilotJob> job);
277 
278 //send call back to the compound job submitters
279  void sendCompoundJobFailureNotification(std::shared_ptr<CompoundJob> job, std::string job_id, std::shared_ptr<FailureCause> cause);
280 
281  // process a job submission
282  void processJobSubmission(std::shared_ptr<BatchJob>job, simgrid::s4u::Mailbox *answer_mailbox);
283 
284  //start a job
285  void startJob(std::map<std::string, std::tuple<unsigned long, double>>, std::shared_ptr<CompoundJob> ,
286  std::shared_ptr<BatchJob>, unsigned long, unsigned long, unsigned long);
287 
288 
289  void processExecuteJobFromBatSched(std::string bat_sched_reply);
290 
291  void processIsThereAtLeastOneHostWithAvailableResources(simgrid::s4u::Mailbox *answer_mailbox, unsigned long num_cores, double ram);
292 
293  };
294 }
295 
296 
297 #endif //WRENCH_BATCH_SERVICE_H
wrench::BatchComputeServiceProperty::SIMULATE_COMPUTATION_AS_SLEEP
static const std::string SIMULATE_COMPUTATION_AS_SLEEP
Simulate computation as just a sleep instead of an actual compute thread. This is for scalability rea...
Definition: BatchComputeServiceProperty.h:142
wrench::ComputeServiceMessagePayload::PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:60
wrench::ComputeServiceMessagePayload::IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask if one host has some resources a...
Definition: ComputeServiceMessagePayload.h:78
wrench::ComputeServiceMessagePayload::SUBMIT_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a compound job submissio...
Definition: ComputeServiceMessagePayload.h:43
wrench::ComputeServiceMessagePayload::TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:66
wrench::ComputeServiceMessagePayload::SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:27
wrench::ComputeServiceMessagePayload::COMPOUND_JOB_DONE_MESSAGE_PAYLOAD
static const std::string COMPOUND_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a compou...
Definition: ComputeServiceMessagePayload.h:45
wrench::ServiceMessagePayload::DAEMON_STOPPED_MESSAGE_PAYLOAD
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:37
wrench::ComputeServiceMessagePayload::TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job terminati...
Definition: ComputeServiceMessagePayload.h:37
wrench::BatchComputeServiceProperty::IGNORE_INVALID_JOBS_IN_WORKLOAD_TRACE_FILE
static const std::string IGNORE_INVALID_JOBS_IN_WORKLOAD_TRACE_FILE
Whether, when simulating a workload trace file, to abort when there is an invalid job specification (...
Definition: BatchComputeServiceProperty.h:104
wrench::BatchComputeServiceProperty::USE_REAL_RUNTIMES_AS_REQUESTED_RUNTIMES_IN_WORKLOAD_TRACE_FILE
static const std::string USE_REAL_RUNTIMES_AS_REQUESTED_RUNTIMES_IN_WORKLOAD_TRACE_FILE
Whether, when simulating a workload trace file, to use the actual runtimes as requested runtimes (i....
Definition: BatchComputeServiceProperty.h:95
wrench::BatchComputeService::BatchComputeService
BatchComputeService(const std::string &hostname, std::vector< std::string > compute_hosts, std::string scratch_space_mount_point, WRENCH_PROPERTY_COLLECTION_TYPE property_list={}, WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list={})
Constructor.
Definition: BatchComputeService.cpp:58
wrench::ComputeServiceMessagePayload::RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to request information on its resources...
Definition: ComputeServiceMessagePayload.h:74
wrench::ComputeServiceMessagePayload::IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state whether one host has some resou...
Definition: ComputeServiceMessagePayload.h:80
wrench::BatchComputeService::supportsCompoundJobs
bool supportsCompoundJobs() override
Returns true if the service supports compound jobs.
Definition: BatchComputeService.cpp:1462
wrench::BatchComputeServiceProperty::BATSCHED_CONTIGUOUS_ALLOCATION
static const std::string BATSCHED_CONTIGUOUS_ALLOCATION
Controls Batsched node allocation policy.
Definition: BatchComputeServiceProperty.h:160
wrench::BatchComputeServiceProperty::HOST_SELECTION_ALGORITHM
static const std::string HOST_SELECTION_ALGORITHM
The host selection algorithm. Can be:
Definition: BatchComputeServiceProperty.h:62
wrench::ComputeService
The compute service base class.
Definition: ComputeService.h:34
wrench::BatchComputeServiceProperty::SUBMIT_TIME_OF_FIRST_JOB_IN_WORKLOAD_TRACE_FILE
static const std::string SUBMIT_TIME_OF_FIRST_JOB_IN_WORKLOAD_TRACE_FILE
A specification of the submit time of the first job in a provided trace file.
Definition: BatchComputeServiceProperty.h:111
wrench::BatchComputeServiceProperty::SIMULATED_WORKLOAD_TRACE_FILE
static const std::string SIMULATED_WORKLOAD_TRACE_FILE
Path to a workload trace file to be replayed. The trace file can be in the SWF format (see http://...
Definition: BatchComputeServiceProperty.h:85
wrench::ComputeServiceMessagePayload::TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:64
wrench::BatchComputeService::supportsStandardJobs
bool supportsStandardJobs() override
Returns true if the service supports standard jobs.
Definition: BatchComputeService.cpp:1454
wrench::BatchComputeService
A batch-scheduled compute service that manages a set of compute hosts and con...
Definition: BatchComputeService.h:49
wrench::ComputeServiceMessagePayload::STANDARD_JOB_DONE_MESSAGE_PAYLOAD
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standa...
Definition: ComputeServiceMessagePayload.h:31
wrench::ComputeServiceMessagePayload::STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job ha...
Definition: ComputeServiceMessagePayload.h:33
wrench
Definition: Action.cpp:28
wrench::ComputeServiceMessagePayload::TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:35
wrench::ComputeServiceMessagePayload::SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission...
Definition: ComputeServiceMessagePayload.h:56
wrench::ComputeServiceMessagePayload::COMPOUND_JOB_FAILED_MESSAGE_PAYLOAD
static const std::string COMPOUND_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running compound job ha...
Definition: ComputeServiceMessagePayload.h:47
wrench::BatchComputeServiceProperty::TASK_STARTUP_OVERHEAD
static const std::string TASK_STARTUP_OVERHEAD
The overhead to start a task execution, in seconds.
Definition: BatchComputeServiceProperty.h:27
wrench::BatchComputeServiceProperty::BATSCHED_LOGGING_MUTED
static const std::string BATSCHED_LOGGING_MUTED
Controls Batsched logging.
Definition: BatchComputeServiceProperty.h:150
wrench::BatchComputeService::supportsPilotJobs
bool supportsPilotJobs() override
Returns true if the service supports pilot jobs.
Definition: BatchComputeService.cpp:1470
wrench::BatchComputeServiceProperty::TASK_SELECTION_ALGORITHM
static const std::string TASK_SELECTION_ALGORITHM
The algorithm to pick which ready computational task (within a standard job executed by the batch_sta...
Definition: BatchComputeServiceProperty.h:72
wrench::ComputeServiceMessagePayload::RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:76
wrench::BatchComputeServiceProperty::OUTPUT_CSV_JOB_LOG
static const std::string OUTPUT_CSV_JOB_LOG
Path to a to-be-generated Batsim-style CSV trace file (e.g. for batch schedule visualization purpose...
Definition: BatchComputeServiceProperty.h:122
wrench::ServiceMessagePayload::STOP_DAEMON_MESSAGE_PAYLOAD
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:35
wrench::ComputeServiceMessagePayload::PILOT_JOB_STARTED_MESSAGE_PAYLOAD
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:58
wrench::ComputeServiceMessagePayload::SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submissio...
Definition: ComputeServiceMessagePayload.h:29
wrench::BatchComputeServiceProperty::BATCH_SCHEDULING_ALGORITHM
static const std::string BATCH_SCHEDULING_ALGORITHM
The batch scheduling algorithm. Can be:
Definition: BatchComputeServiceProperty.h:43
wrench::ComputeServiceMessagePayload::SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:54
wrench::BatchComputeServiceProperty::BATCH_RJMS_PADDING_DELAY
static const std::string BATCH_RJMS_PADDING_DELAY
Integral number of seconds that the Batch Scheduler adds to the runtime of each incoming job....
Definition: BatchComputeServiceProperty.h:132
wrench::ComputeServiceMessagePayload::SUBMIT_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string SUBMIT_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a compound job.
Definition: ComputeServiceMessagePayload.h:41
wrench::BatchComputeServiceProperty::BATCH_QUEUE_ORDERING_ALGORITHM
static const std::string BATCH_QUEUE_ORDERING_ALGORITHM
The batch queue ordering algorithm. Can be:
Definition: BatchComputeServiceProperty.h:52
wrench::ComputeServiceMessagePayload::TERMINATE_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
static const std::string TERMINATE_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a compound job.
Definition: ComputeServiceMessagePayload.h:49
wrench::ComputeServiceMessagePayload::TERMINATE_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
static const std::string TERMINATE_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a compound job terminati...
Definition: ComputeServiceMessagePayload.h:51