11 #ifndef WRENCH_BATCH_SERVICE_H
12 #define WRENCH_BATCH_SERVICE_H
14 #include "wrench/services/compute/ComputeService.h"
15 #include "wrench/services/compute/batch/BatchJob.h"
16 #include "wrench/services/compute/batch/BatschedNetworkListener.h"
17 #include "wrench/services/compute/batch/BatchComputeServiceProperty.h"
18 #include "wrench/services/compute/batch/BatchComputeServiceMessagePayload.h"
19 #include "wrench/services/helper_services/alarm/Alarm.h"
20 #include "wrench/job/CompoundJob.h"
21 #include "wrench/job/Job.h"
22 #include "wrench/services/compute/batch/batch_schedulers/BatchScheduler.h"
// Forward declarations. In the visible part of this header these two types are
// only used inside std::shared_ptr<> and in a friend declaration, so their full
// definitions are not needed here.
31 class WorkloadTraceFileReplayer;
32 class BareMetalComputeServiceOneShot;
56 WRENCH_PROPERTY_COLLECTION_TYPE default_property_values = {
60 #ifdef ENABLE_BATSCHED
80 WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE default_messagepayload_values = {
109 std::vector<std::string> compute_hosts,
110 std::string scratch_space_mount_point,
111 WRENCH_PROPERTY_COLLECTION_TYPE property_list = {},
112 WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list = {}
/**
 * @brief Declaration only; the definition is not part of this listing.
 * @param resources a set of (string, unsigned long, unsigned long, double) tuples
 *        -- presumably (job id, number of nodes, cores per node, duration); TODO confirm
 * @return a map from string keys to double values
 *         (presumably one start-time estimate per job id; TODO confirm)
 */
122 std::map<std::string,double> getStartTimeEstimates(std::set<std::tuple<std::string,unsigned long,unsigned long, double>> resources);
/**
 * @brief Declaration only; the definition is not part of this listing.
 * @return a vector of 7-field tuples (string, string, int, int, int, double, double).
 *         NOTE(review): the meaning of each field is not visible here -- confirm against the .cpp
 */
124 std::vector<std::tuple<std::string, std::string, int, int, int, double, double>> getQueue();
/**
 * @brief Static helper that takes a key and a read-only string-to-string map and
 *        produces an unsigned long. Declaration only.
 *        NOTE(review): presumably looks up @p key in @p args and parses its value;
 *        behavior for a missing key or a non-numeric value is not visible here -- TODO confirm
 * @param key  the argument name (taken by value)
 * @param args the argument map (taken by const reference, so it is not modified)
 * @return an unsigned long
 */
135 static unsigned long parseUnsignedLongServiceSpecificArgument(std::string key,
const std::map<std::string, std::string> &args);
/**
 * @brief Overrides a base-class virtual method (marked `override`). Declaration only.
 * @param compound_job          shared pointer to a CompoundJob
 * @param service_specific_args string-to-string map taken by NON-const reference, so the
 *                              implementation is allowed to modify the caller's map
 *                              -- TODO confirm whether it actually does
 */
137 void validateServiceSpecificArguments(std::shared_ptr<CompoundJob> compound_job,
138 std::map<std::string, std::string> &service_specific_args)
override;
// Friend classes: each of these gets access to the non-public members of this class.
// NOTE(review): original header line 149 is missing from this listing; the
// BatschedBatchScheduler friendship may be guarded by a preprocessor conditional -- confirm.
145 friend class WorkloadTraceFileReplayer;
146 friend class FCFSBatchScheduler;
147 friend class ConservativeBackfillingBatchScheduler;
148 friend class ConservativeBackfillingBatchSchedulerCoreLevel;
150 friend class BatschedBatchScheduler;
153 std::vector<std::string> compute_hosts,
154 unsigned long cores_per_host,
156 std::string scratch_space_mount_point,
157 WRENCH_PROPERTY_COLLECTION_TYPE property_list,
158 WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list,
/**
 * @brief Overrides a base-class virtual method (marked `override`). Declaration only.
 * @param job            shared pointer to the CompoundJob
 * @param batch_job_args string-to-string map of arguments, taken by const reference
 */
163 void submitCompoundJob(std::shared_ptr<CompoundJob> job,
const std::map<std::string, std::string> &batch_job_args)
override;
/**
 * @brief Overrides a base-class virtual method (marked `override`). Declaration only.
 * @param job shared pointer to the CompoundJob
 */
166 void terminateCompoundJob(std::shared_ptr<CompoundJob> job)
override;
// NOTE(review): the original comments for these members were lost in this listing.
// The notes below state only what the declared types show; anything marked
// "presumably" is inferred from the member name and must be confirmed in the .cpp.

// Workload trace entries: 7-field tuples (string, double, double, double, double, unsigned int, string).
168 std::vector<std::tuple<std::string, double, double, double, double, unsigned int, std::string>> workload_trace;
// Shared pointer to a WorkloadTraceFileReplayer.
169 std::shared_ptr<WorkloadTraceFileReplayer> workload_trace_replayer;
// Boolean flag, initialized to false.
171 bool clean_exit =
false;
// CompoundJob -> Alarm (presumably the alarm that fires on that job's timeout).
174 std::map<std::shared_ptr<CompoundJob>,std::shared_ptr<Alarm>> compound_job_alarms;
177 unsigned long total_num_of_nodes;
178 unsigned long num_cores_per_node;
// string -> unsigned long (presumably host name -> total core count).
179 std::map<std::string, unsigned long> nodes_to_cores_map;
180 std::vector<double> timeslots;
// string -> unsigned long (presumably host name -> currently available core count).
181 std::map<std::string, unsigned long> available_nodes_to_cores;
// unsigned long -> string (presumably numeric host id -> host name).
182 std::map<unsigned long, std::string> host_id_to_names;
183 std::vector<std::string> compute_hosts;
// CompoundJob -> BareMetalComputeServiceOneShot (presumably the one-shot service executing that job).
187 std::map<std::shared_ptr<CompoundJob>, std::shared_ptr<BareMetalComputeServiceOneShot>> running_bare_metal_one_shot_compute_services;
// CompoundJob -> BatchJob (presumably every job known to the service).
190 std::map<std::shared_ptr<CompoundJob>, std::shared_ptr<BatchJob>> all_jobs;
// CompoundJob -> BatchJob (presumably only the jobs currently running).
193 std::map<std::shared_ptr<CompoundJob>, std::shared_ptr<BatchJob>> running_jobs;
// Double-ended queue of BatchJob shared pointers.
196 std::deque<std::shared_ptr<BatchJob>> batch_queue;
// Set of BatchJob shared pointers.
200 std::set<std::shared_ptr<BatchJob>> waiting_jobs;
// Scheduler held through std::unique_ptr, i.e. exclusively owned by this object.
203 std::unique_ptr<BatchScheduler> scheduler;
206 #ifdef ENABLE_BATSCHED
208 std::set<std::string> scheduling_algorithms = {
"conservative_bf",
"crasher",
"easy_bf",
"easy_bf_fast",
209 "easy_bf_plot_liquid_load_horizon",
210 "energy_bf",
"energy_bf_dicho",
"energy_bf_idle_sleeper",
211 "energy_bf_monitoring",
212 "energy_bf_monitoring_inertial",
"energy_bf_subpart_sleeper",
213 "energy_watcher",
"fcfs_fast",
"fast_conservative_bf",
214 "filler",
"killer",
"killer2",
"random",
"rejecter",
215 "sequencer",
"sleeper",
"submitter",
"waiting_time_estimator"
218 std::set<std::string> queue_ordering_options = {
"fcfs",
"lcfs",
"desc_bounded_slowdown",
"desc_slowdown",
219 "asc_size",
"desc_size",
"asc_walltime",
"desc_walltime"
223 std::set<std::string> scheduling_algorithms = {
"fcfs",
"conservative_bf",
"conservative_bf_core_level"
227 std::set<std::string> queue_ordering_options = {
// Declarations only; definitions are not part of this listing.
// Takes no arguments and returns an unsigned long.
232 unsigned long generateUniqueJobID();
// The three helpers below each take a BatchJob shared pointer and return nothing.
// NOTE(review): presumably each removes the job from the container named in the
// method (running list / batch queue / jobs list) -- confirm against the .cpp.
234 void removeJobFromRunningList(std::shared_ptr<BatchJob> job);
236 void removeJobFromBatchQueue(std::shared_ptr<BatchJob> job);
238 void removeBatchJobFromJobsList(std::shared_ptr<BatchJob> job);
// Declarations only; definitions are not part of this listing.
// Returns a bool. NOTE(review): the meaning of the return value is not visible here
// (presumably whether the service should keep processing messages) -- confirm.
242 bool processNextMessage();
244 void startBackgroundWorkloadProcess();
// Takes the mailbox on which to answer (raw, non-owning pointer) and a string key.
246 void processGetResourceInformation(simgrid::s4u::Mailbox *answer_mailbox,
const std::string &key);
// Takes the one-shot bare-metal service ("executor") and the CompoundJob it relates to.
248 void processCompoundJobCompletion(std::shared_ptr<BareMetalComputeServiceOneShot> executor, std::shared_ptr<CompoundJob> job);
// Same as above, plus a FailureCause describing the failure.
250 void processCompoundJobFailure(std::shared_ptr<BareMetalComputeServiceOneShot> executor,
251 std::shared_ptr<CompoundJob> job,
252 std::shared_ptr<FailureCause> cause);
// Takes the CompoundJob and a ComputeService::TerminationCause value.
254 void terminateRunningCompoundJob(std::shared_ptr<CompoundJob> job, ComputeService::TerminationCause termination_cause);
/**
 * @brief Overrides a base-class virtual method (marked `override`). Declaration only.
 * @param has_returned_from_main boolean flag
 *        (presumably whether the service's main routine returned normally -- TODO confirm)
 * @param return_value integer value (presumably the main routine's return value -- TODO confirm)
 */
258 void cleanup(
bool has_returned_from_main,
int return_value)
override;
/**
 * @brief Declaration only; not marked `override`.
 * @param send_failure_notifications boolean flag
 *        (presumably whether job-failure notifications are sent on termination -- TODO confirm)
 * @param termination_cause a ComputeService::TerminationCause value
 */
261 void terminate(
bool send_failure_notifications, ComputeService::TerminationCause termination_cause);
// Declarations only; definitions are not part of this listing.
// Takes the CompoundJob concerned.
264 void processCompoundJobTimeout(std::shared_ptr<CompoundJob> job);
// Takes the CompoundJob concerned and the mailbox on which to answer (raw, non-owning pointer).
267 void processCompoundJobTerminationRequest(std::shared_ptr<CompoundJob> job, simgrid::s4u::Mailbox *answer_mailbox);
/**
 * @brief Declaration only; the definition is not part of this listing.
 *        NOTE(review): presumably invoked when the alarm associated with a batch job
 *        fires -- confirm against the .cpp.
 * @param batch_job shared pointer to the BatchJob concerned
 */
270 void processAlarmJobTimeout(std::shared_ptr<BatchJob> batch_job);
// Declarations only; definitions are not part of this listing.
// Takes a map from string to (unsigned long, double) tuples.
// NOTE(review): presumably host name -> (cores, RAM) -- confirm.
273 void freeUpResources(std::map<std::string, std::tuple<unsigned long, double>> resources);
// Takes the PilotJob concerned.
276 void sendPilotJobExpirationNotification(std::shared_ptr<PilotJob> job);
// Takes the CompoundJob, a string job id, and the FailureCause to report.
279 void sendCompoundJobFailureNotification(std::shared_ptr<CompoundJob> job, std::string job_id, std::shared_ptr<FailureCause> cause);
// Takes the BatchJob and the mailbox on which to answer (raw, non-owning pointer).
282 void processJobSubmission(std::shared_ptr<BatchJob>job, simgrid::s4u::Mailbox *answer_mailbox);
// Declarations only; definitions are not part of this listing.
// All six parameters are unnamed in this declaration: a string -> (unsigned long, double)
// map, a CompoundJob, a BatchJob, and three unsigned long values.
// NOTE(review): the meaning of the three unsigned longs cannot be determined from here --
// check the parameter names on the definition in the .cpp.
285 void startJob(std::map<std::string, std::tuple<unsigned long, double>>, std::shared_ptr<CompoundJob> ,
286 std::shared_ptr<BatchJob>,
unsigned long,
unsigned long,
unsigned long);
// Takes a string (presumably a reply received from Batsched -- confirm).
289 void processExecuteJobFromBatSched(std::string bat_sched_reply);
// Takes the mailbox on which to answer (raw, non-owning pointer), a core count and a RAM amount.
291 void processIsThereAtLeastOneHostWithAvailableResources(simgrid::s4u::Mailbox *answer_mailbox,
unsigned long num_cores,
double ram);
297 #endif //WRENCH_BATCH_SERVICE_H
static const std::string SIMULATE_COMPUTATION_AS_SLEEP
Simulate computation as just a sleep instead of an actual compute thread. This is for scalability rea...
Definition: BatchComputeServiceProperty.h:142
static const std::string PILOT_JOB_EXPIRED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has expired.
Definition: ComputeServiceMessagePayload.h:60
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to ask if one host has some resources a...
Definition: ComputeServiceMessagePayload.h:78
static const std::string SUBMIT_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a compound job submissio...
Definition: ComputeServiceMessagePayload.h:43
static const std::string TERMINATE_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a pilot job termination.
Definition: ComputeServiceMessagePayload.h:66
static const std::string SUBMIT_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a standard job.
Definition: ComputeServiceMessagePayload.h:27
static const std::string COMPOUND_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a compou...
Definition: ComputeServiceMessagePayload.h:45
static const std::string DAEMON_STOPPED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to confirm it has terminated.
Definition: ServiceMessagePayload.h:37
static const std::string TERMINATE_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job terminati...
Definition: ComputeServiceMessagePayload.h:37
static const std::string IGNORE_INVALID_JOBS_IN_WORKLOAD_TRACE_FILE
Whether, when simulating a workload trace file, to abort when there is an invalid job specification (...
Definition: BatchComputeServiceProperty.h:104
static const std::string USE_REAL_RUNTIMES_AS_REQUESTED_RUNTIMES_IN_WORKLOAD_TRACE_FILE
Whether, when simulating a workload trace file, to use the actual runtimes as requested runtimes (i....
Definition: BatchComputeServiceProperty.h:95
BatchComputeService(const std::string &hostname, std::vector< std::string > compute_hosts, std::string scratch_space_mount_point, WRENCH_PROPERTY_COLLECTION_TYPE property_list={}, WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list={})
Constructor.
Definition: BatchComputeService.cpp:58
static const std::string RESOURCE_DESCRIPTION_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to request information on its resources...
Definition: ComputeServiceMessagePayload.h:74
static const std::string IS_THERE_AT_LEAST_ONE_HOST_WITH_AVAILABLE_RESOURCES_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state whether one host has some resou...
Definition: ComputeServiceMessagePayload.h:80
bool supportsCompoundJobs() override
Returns true if the service supports compound jobs.
Definition: BatchComputeService.cpp:1462
static const std::string BATSCHED_CONTIGUOUS_ALLOCATION
Controls Batsched node allocation policy.
Definition: BatchComputeServiceProperty.h:160
static const std::string HOST_SELECTION_ALGORITHM
The host selection algorithm. Can be:
Definition: BatchComputeServiceProperty.h:62
The compute service base class.
Definition: ComputeService.h:34
static const std::string SUBMIT_TIME_OF_FIRST_JOB_IN_WORKLOAD_TRACE_FILE
A specification of the submit time of the first job in a provided trace file.
Definition: BatchComputeServiceProperty.h:111
static const std::string SIMULATED_WORKLOAD_TRACE_FILE
Path to a workload trace file to be replayed. The trace file can be in the SWF format (see http://...
Definition: BatchComputeServiceProperty.h:85
static const std::string TERMINATE_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a pilot job.
Definition: ComputeServiceMessagePayload.h:64
bool supportsStandardJobs() override
Returns true if the service supports standard jobs.
Definition: BatchComputeService.cpp:1454
A batch-scheduled compute service that manages a set of compute hosts and con...
Definition: BatchComputeService.h:49
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that it has completed a standa...
Definition: ComputeServiceMessagePayload.h:31
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running standard job ha...
Definition: ComputeServiceMessagePayload.h:33
Definition: Action.cpp:28
static const std::string TERMINATE_STANDARD_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a standard job.
Definition: ComputeServiceMessagePayload.h:35
static const std::string SUBMIT_PILOT_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent from the daemon to acknowledge a pilot job submission...
Definition: ComputeServiceMessagePayload.h:56
static const std::string COMPOUND_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a running compound job ha...
Definition: ComputeServiceMessagePayload.h:47
static const std::string TASK_STARTUP_OVERHEAD
The overhead to start a task execution, in seconds.
Definition: BatchComputeServiceProperty.h:27
static const std::string BATSCHED_LOGGING_MUTED
Controls Batsched logging.
Definition: BatchComputeServiceProperty.h:150
bool supportsPilotJobs() override
Returns true if the service supports pilot jobs.
Definition: BatchComputeService.cpp:1470
static const std::string TASK_SELECTION_ALGORITHM
The algorithm to pick which ready computational task (within a standard job executed by the batch_sta...
Definition: BatchComputeServiceProperty.h:72
static const std::string RESOURCE_DESCRIPTION_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state information on its resources.
Definition: ComputeServiceMessagePayload.h:76
static const std::string OUTPUT_CSV_JOB_LOG
Path to a to-be-generated Batsim-style CSV trace file (e.g. for batch schedule visualization purpose...
Definition: BatchComputeServiceProperty.h:122
static const std::string STOP_DAEMON_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate it.
Definition: ServiceMessagePayload.h:35
static const std::string PILOT_JOB_STARTED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to state that a pilot job has started.
Definition: ComputeServiceMessagePayload.h:58
static const std::string SUBMIT_STANDARD_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a standard job submissio...
Definition: ComputeServiceMessagePayload.h:29
static const std::string BATCH_SCHEDULING_ALGORITHM
The batch scheduling algorithm. Can be:
Definition: BatchComputeServiceProperty.h:43
static const std::string SUBMIT_PILOT_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a pilot job.
Definition: ComputeServiceMessagePayload.h:54
static const std::string BATCH_RJMS_PADDING_DELAY
Integral number of seconds that the Batch Scheduler adds to the runtime of each incoming job....
Definition: BatchComputeServiceProperty.h:132
static const std::string SUBMIT_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to submit a compound job.
Definition: ComputeServiceMessagePayload.h:41
static const std::string BATCH_QUEUE_ORDERING_ALGORITHM
The batch queue ordering algorithm. Can be:
Definition: BatchComputeServiceProperty.h:52
static const std::string TERMINATE_COMPOUND_JOB_REQUEST_MESSAGE_PAYLOAD
The number of bytes in the control message sent to the daemon to terminate a compound job.
Definition: ComputeServiceMessagePayload.h:49
static const std::string TERMINATE_COMPOUND_JOB_ANSWER_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the daemon to acknowledge a compound job terminati...
Definition: ComputeServiceMessagePayload.h:51