10 #ifndef WRENCH_MULTINODEMULTICORESTANDARDJOBEXECUTOR_H
11 #define WRENCH_MULTINODEMULTICORESTANDARDJOBEXECUTOR_H
17 #include "wrench/services/compute/ComputeService.h"
18 #include "wrench/services/compute/workunit_executor/WorkunitExecutor.h"
19 #include "wrench/services/compute/standard_job_executor/StandardJobExecutorProperty.h"
20 #include "wrench/services/compute/standard_job_executor/StandardJobExecutorMessagePayload.h"
21 #include "wrench/services/compute/workunit_executor/Workunit.h"
22 #include "wrench/services/helpers/HostStateChangeDetector.h"
54 std::string callback_mailbox,
56 std::shared_ptr<StandardJob> job,
57 std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
58 std::shared_ptr<StorageService> scratch_space,
59 bool part_of_pilot_job,
61 std::map<std::string, std::string> property_list,
/**
 * Kill the executor.
 *
 * @param job_termination  boolean flag supplied by the caller.
 *        NOTE(review): presumably indicates whether the kill is due to the
 *        job being terminated -- confirm in StandardJobExecutor.cpp.
 */
65 void kill(
bool job_termination);
/**
 * Get the executor's job.
 *
 * @return a shared pointer to a StandardJob (presumably the `job` member --
 *         confirm in StandardJobExecutor.cpp).
 */
67 std::shared_ptr<StandardJob>
getJob();
/**
 * Overrides the cleanup() method inherited from a base class (the base
 * declaration is not visible in this listing).
 *
 * @param has_returned_from_main  NOTE(review): presumably true when the
 *        daemon's main routine returned normally -- confirm against the base class.
 * @param return_value  NOTE(review): presumably the value returned by the
 *        main routine -- confirm against the base class.
 */
76 void cleanup(
bool has_returned_from_main,
int return_value)
override;
// Same name and type as the constructor's callback_mailbox argument.
// NOTE(review): presumably the mailbox to which job outcome messages are sent -- confirm.
78 std::string callback_mailbox;
// The job handled by this executor (same type as returned by getJob()).
79 std::shared_ptr<StandardJob> job;
// Map from a string key to an (unsigned long, double) tuple.
// NOTE(review): presumably hostname -> (number of cores, RAM) -- confirm.
80 std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
// Storage service used as scratch space (same name and type as the constructor argument).
83 std::shared_ptr<StorageService> scratch_space;
// Same name and type as the constructor's part_of_pilot_job argument.
// NOTE(review): presumably whether the job runs within a pilot job -- confirm.
85 bool part_of_pilot_job;
// Set of workflow files (non-owning raw pointers) recorded as stored in scratch space.
91 std::set<WorkflowFile*> files_stored_in_scratch;
// Per-key core counts. NOTE(review): presumably currently available cores per host -- confirm.
94 std::map<std::string, unsigned long> core_availabilities;
// Per-key RAM amounts. NOTE(review): presumably currently available RAM per host -- confirm.
96 std::map<std::string, double> ram_availabilities;
// Workunit executors, partitioned into three sets (running / finished / failed) by their names.
99 std::set<std::shared_ptr<WorkunitExecutor>> running_workunit_executors;
100 std::set<std::shared_ptr<WorkunitExecutor>> finished_workunit_executors;
101 std::set<std::shared_ptr<WorkunitExecutor>> failed_workunit_executors;
// Workunits, partitioned into four sets (non-ready / ready / running / completed) by their names.
104 std::set<std::shared_ptr<Workunit>> non_ready_workunits;
105 std::set<std::shared_ptr<Workunit>> ready_workunits;
106 std::set<std::shared_ptr<Workunit>> running_workunits;
107 std::set<std::shared_ptr<Workunit>> completed_workunits;
// String-to-string property map (same name and type as the constructor's property_list argument).
110 std::map<std::string, std::string> property_list;
112 std::map<std::string, std::string> default_property_values = {
120 std::map<std::string, double> default_messagepayload_values = {
// Shared handle to a HostStateChangeDetector helper service.
// NOTE(review): presumably used to be notified of host state changes -- confirm in StandardJobExecutor.cpp.
125 std::shared_ptr<HostStateChangeDetector> host_state_monitor;
// Handler taking a workunit executor and the workunit it is associated with.
// NOTE(review): by its name, invoked when a workunit executor completes -- confirm in StandardJobExecutor.cpp.
129 void processWorkunitExecutorCompletion(std::shared_ptr<WorkunitExecutor> workunit_executor,
130 std::shared_ptr<Workunit> workunit);
// Handler taking a workunit executor, a workunit, and a failure cause.
// NOTE(review): by its name, invoked when a workunit executor reports a failure -- confirm in StandardJobExecutor.cpp.
132 void processWorkunitExecutorFailure(std::shared_ptr<WorkunitExecutor> workunit_executor,
133 std::shared_ptr<Workunit> workunit,
134 std::shared_ptr<FailureCause> cause);
// Handler taking only the workunit executor (no workunit or cause argument).
// NOTE(review): by its name, invoked when a workunit executor crashes -- confirm in StandardJobExecutor.cpp.
136 void processWorkunitExecutorCrash(std::shared_ptr<WorkunitExecutor> workunit_executor);
// Processes the next message and returns a boolean.
// NOTE(review): the meaning of the return value is not visible here
// (presumably whether the executor should keep running) -- confirm.
138 bool processNextMessage();
// Returns a core count for the given workunit (non-owning pointer).
// NOTE(review): by its name, the minimum number of cores the workunit needs -- confirm.
140 unsigned long computeWorkUnitMinNumCores(
Workunit *wu);
// Returns a core count for the given workunit (non-owning pointer).
// NOTE(review): by its name, the desired number of cores for the workunit;
// possibly driven by the CORE_ALLOCATION_ALGORITHM property -- confirm.
141 unsigned long computeWorkUnitDesiredNumCores(
Workunit *wu);
// Returns a memory amount (double) for the given workunit (non-owning pointer).
// NOTE(review): by its name, the minimum memory the workunit needs; units not visible here -- confirm.
142 double computeWorkUnitMinMemory(
Workunit *wu);
// NOTE(review): by its name, starts execution of workunits that are ready
// (see ready_workunits) on available resources -- confirm in StandardJobExecutor.cpp.
144 void dispatchReadyWorkunits();
// Returns workunits as an ordered vector.
// NOTE(review): presumably the contents of ready_workunits ordered according to
// the TASK_SELECTION_ALGORITHM property -- confirm in StandardJobExecutor.cpp.
148 std::vector<std::shared_ptr<Workunit>> sortReadyWorkunits();
// NOTE(review): by its name, removes files from the scratch space
// (see scratch_space / files_stored_in_scratch) -- confirm in StandardJobExecutor.cpp.
151 void cleanUpScratch();
// NOTE(review): by its name, records which files were stored in scratch space
// (see files_stored_in_scratch, exposed via getFilesInScratch()) -- confirm in StandardJobExecutor.cpp.
154 void StoreListOfFilesInScratch();
165 #endif //WRENCH_MULTINODEMULTICORESTANDARDJOBEXECUTOR_H
static const std::string STANDARD_JOB_DONE_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the executor to state that it has completed a job.
Definition: StandardJobExecutorMessagePayload.h:30
StandardJobExecutor(Simulation *simulation, std::string callback_mailbox, std::string hostname, std::shared_ptr< StandardJob > job, std::map< std::string, std::tuple< unsigned long, double >> compute_resources, std::shared_ptr< StorageService > scratch_space, bool part_of_pilot_job, PilotJob *parent_pilot_job, std::map< std::string, std::string > property_list, std::map< std::string, double > messagepayload_list)
Constructor.
Definition: StandardJobExecutor.cpp:64
static const std::string STANDARD_JOB_FAILED_MESSAGE_PAYLOAD
The number of bytes in the control message sent by the executor to state that a job has failed.
Definition: StandardJobExecutorMessagePayload.h:32
void kill(bool job_termination)
Kill the executor.
Definition: StandardJobExecutor.cpp:257
~StandardJobExecutor()
Destructor.
Definition: StandardJobExecutor.cpp:40
std::shared_ptr< StandardJob > getJob()
Get the executor's job.
Definition: StandardJobExecutor.cpp:990
A class to describe a unit of work that's a sub-component of a StandardJob.
Definition: Workunit.h:35
static const std::string CORE_ALLOCATION_ALGORITHM
The algorithm that decides how many cores are given to a computational task. Possible values are:
Definition: StandardJobExecutorProperty.h:41
static const std::string TASK_SELECTION_ALGORITHM
The algorithm that decides which ready computational task, in case multiple tasks are ready,...
Definition: StandardJobExecutorProperty.h:49
static const std::string HOST_SELECTION_ALGORITHM
The algorithm that decides on which host a task should be placed. Possible values are:
Definition: StandardJobExecutorProperty.h:55
A service that knows how to execute a standard job on a multi-node multi-core platform....
Definition: StandardJobExecutor.h:45
static const std::string TASK_STARTUP_OVERHEAD
The number of seconds to start a task (default = 0)
Definition: StandardJobExecutorProperty.h:30
std::map< std::string, std::tuple< unsigned long, double > > getComputeResources()
Get the executor's compute resources.
Definition: StandardJobExecutor.cpp:998
std::map< std::string, double > messagepayload_list
The service's messagepayload list.
Definition: Service.h:112
std::string hostname
The name of the host on which the daemon is running.
Definition: S4U_Daemon.h:51
A class that provides basic simulation methods. Once the simulation object has been explicitly or imp...
Definition: Simulation.h:46
static const std::string SIMULATE_COMPUTATION_AS_SLEEP
Simulate computation as just a sleep instead of with an actual compute thread. This is for scalabilit...
Definition: StandardJobExecutorProperty.h:61
std::set< WorkflowFile * > getFilesInScratch()
Get the set of files stored in scratch space during the standard job's execution.
Definition: StandardJobExecutor.cpp:982
A pilot (i.e., non-standard) workflow job that can be submitted to a ComputeService by a WMS (via a J...
Definition: PilotJob.h:27
A service that can be added to the simulation and that can be used by a WMS when executing a workflow...
Definition: Service.h:26
Simulation * simulation
A pointer to the simulation object.
Definition: S4U_Daemon.h:105