WRENCH  1.11
Cyberinfrastructure Simulation Workbench
Overview Installation Getting Started WRENCH 101 WRENCH 102
ActionExecutionService.h
1 
10 #ifndef WRENCH_ACTION_SCHEDULER_H
11 #define WRENCH_ACTION_SCHEDULER_H
12 
13 
14 #include <queue>
15 
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/helper_services/host_state_change_detector/HostStateChangeDetector.h"
18 #include "wrench/services/helper_services/action_execution_service/ActionExecutionServiceProperty.h"
19 
20 
21 namespace wrench {
22 
23  class Simulation;
24  class FailureCause;
25  class Alarm;
26  class Action;
27  class ActionExecutor;
28 
29 
39 
40  private:
41 
42  WRENCH_PROPERTY_COLLECTION_TYPE default_property_values = {
45  };
46 
47  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE default_messagepayload_values = {
48  };
49 
50  public:
51 
52  // Public Constructor
53  ActionExecutionService(const std::string &hostname,
54  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
55  std::shared_ptr<Service> parent_service,
56  WRENCH_PROPERTY_COLLECTION_TYPE property_list = {},
57  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list = {}
58  );
59 
60  /***********************/
62  /***********************/
63 
64  bool actionCanRun(std::shared_ptr<Action> action);
65 
66  std::shared_ptr<Service> getParentService() const;
67 
68  void setParentService(std::shared_ptr<Service> parent);
69 
70  void submitAction(const std::shared_ptr<Action> &action);
71 
72  void terminateAction(std::shared_ptr<Action> action, ComputeService::TerminationCause termination_cause);
73 
74  bool IsThereAtLeastOneHostWithAvailableResources(unsigned long num_cores, double ram);
75 
76  std::map<std::string, std::tuple<unsigned long, double>> &getComputeResources();
77 
78  std::map<std::string, double> getResourceInformation(const std::string &key);
79 
81 
82  /***********************/
84  /***********************/
85 
86  private:
87 
88  friend class Simulation;
89 
90  void validateProperties();
91 
92 
93  std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
94 
95  // Core availabilities (for each host, how many cores and how many bytes of RAM are currently available on it)
96  std::unordered_map<std::string, double> ram_availabilities;
97  std::unordered_map<std::string, unsigned long> running_thread_counts;
98 
99  std::shared_ptr<Service> parent_service = nullptr;
100 
101  std::unordered_map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<DataFile>>> files_in_scratch;
102 
103  // Set of running actions
104  std::set<std::shared_ptr<Action> > running_actions;
105 
106  // Action execution specs
107  std::unordered_map<std::shared_ptr<Action> , std::tuple<std::string, unsigned long>> action_run_specs;
108 
109  std::set<std::shared_ptr<Action>> all_actions;
110  std::deque<std::shared_ptr<Action>> ready_actions;
111 
112  // Set of running ActionExecutors
113  std::unordered_map<std::shared_ptr<Action> , std::shared_ptr<ActionExecutor>> action_executors;
114 
115  int main() override;
116 
117  // Helper functions to make main() a bit more palatable
118 
119  void terminate(bool send_failure_notifications, ComputeService::TerminationCause termination_cause);
120 
121  void failCurrentActions();
122 
123  void processActionExecutorCompletion(std::shared_ptr<ActionExecutor> executor);
124 
125  void processActionExecutorFailure(std::shared_ptr<ActionExecutor> executor);
126 
127  void processActionExecutorCrash(std::shared_ptr<ActionExecutor> executor);
128 
129  void processActionTerminationRequest(std::shared_ptr<Action> action, simgrid::s4u::Mailbox *answer_mailbox, ComputeService::TerminationCause termination_cause);
130 
131  bool processNextMessage();
132 
133  void dispatchReadyActions();
134 
135 // void someHostIsBackOn(simgrid::s4u::Host const &h);
136 // bool host_back_on = false;
137 
138 
140  enum JobTerminationCause { // Two-valued cause code declared in the private section of the class.
142  TERMINATED, // NOTE(review): per-value docs are absent from this listing; presumably an explicitly requested termination -- confirm.
143 
145  COMPUTE_SERVICE_KILLED // NOTE(review): presumably termination because the compute service was killed -- confirm.
146  }; // NOTE(review): nothing else in this listing refers to JobTerminationCause; terminate()/terminateAction() take ComputeService::TerminationCause -- confirm this enum is still used.
147 
148  void terminateRunningAction(std::shared_ptr<Action> action, bool killed_due_to_job_cancelation);
149 
150  void killAction(std::shared_ptr<Action> action, std::shared_ptr<FailureCause> cause);
151 
152 
153  void processSubmitAction(simgrid::s4u::Mailbox *answer_mailbox, std::shared_ptr<Action> action);
154 
155  std::tuple<std::string, unsigned long> pickAllocation(std::shared_ptr<Action> action,
156  std::string required_host, unsigned long required_num_cores,
157  std::set<std::string> &hosts_to_avoid);
158 
159 
160  bool isThereAtLeastOneHostWithResources(unsigned long num_cores, double ram);
161 
162  void cleanup(bool has_terminated_cleanly, int return_value) override;
163 
164  bool areAllComputeResourcesDownWithNoActionExecutorRunning();
165 
166 
167  int exit_code = 0;
168 
169  std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
170 
171  };
172 };
173 
174 
175 #endif //WRENCH_ACTION_SCHEDULER_H
wrench::ActionExecutionServiceProperty::FAIL_ACTION_AFTER_ACTION_EXECUTOR_CRASH
static const std::string FAIL_ACTION_AFTER_ACTION_EXECUTOR_CRASH
If true, fail the action after an executor crash; otherwise re-ready it and try again.
Definition: ActionExecutionServiceProperty.h:33
wrench::Service::property_list
WRENCH_PROPERTY_COLLECTION_TYPE property_list
The service's property list.
Definition: Service.h:110
wrench::ActionExecutionService::actionCanRun
bool actionCanRun(std::shared_ptr< Action > action)
Helper method that determines whether a submitted action (with service-specific arguments) can run gi...
Definition: ActionExecutionService.cpp:890
wrench::ComputeService::TerminationCause
TerminationCause
Job termination cause enum.
Definition: ComputeService.h:67
wrench::ActionExecutionService
An action execution service that:
Definition: ActionExecutionService.h:38
wrench::ActionExecutionService::getResourceInformation
std::map< std::string, double > getResourceInformation(const std::string &key)
Return resource information.
Definition: ActionExecutionService.cpp:1028
wrench::Service::messagepayload_list
WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list
The service's messagepayload list.
Definition: Service.h:113
wrench::ActionExecutionService::ActionExecutionService
ActionExecutionService(const std::string &hostname, std::map< std::string, std::tuple< unsigned long, double >> compute_resources, std::shared_ptr< Service > parent_service, WRENCH_PROPERTY_COLLECTION_TYPE property_list={}, WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list={})
Constructor.
Definition: ActionExecutionService.cpp:205
wrench::ActionExecutionService::submitAction
void submitAction(const std::shared_ptr< Action > &action)
Submit an action to the action execution service.
Definition: ActionExecutionService.cpp:108
wrench
Definition: Action.cpp:28
wrench::ActionExecutionService::~ActionExecutionService
~ActionExecutionService()
Destructor.
Definition: ActionExecutionService.cpp:42
wrench::ActionExecutionService::getComputeResources
std::map< std::string, std::tuple< unsigned long, double > > & getComputeResources()
Get (a reference to) the compute resources of this service.
Definition: ActionExecutionService.cpp:1175
wrench::ActionExecutionService::getParentService
std::shared_ptr< Service > getParentService() const
Get the parent compute service (could be nullptr if stand-alone)
Definition: ActionExecutionService.cpp:1183
wrench::S4U_Daemon::hostname
std::string hostname
The name of the host on which the daemon is running.
Definition: S4U_Daemon.h:60
wrench::Simulation
A class that provides basic simulation methods. Once the simulation object has been explicitly or imp...
Definition: Simulation.h:48
wrench::ActionExecutionService::setParentService
void setParentService(std::shared_ptr< Service > parent)
Set parent service.
Definition: ActionExecutionService.cpp:1167
wrench::ActionExecutionServiceProperty::TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
static const std::string TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN
Whether the ActionExecutionService should terminate if all its hosts are down.
Definition: ActionExecutionServiceProperty.h:30
wrench::Service
A service that can be added to the simulation and that can be used by a WMS when executing a workflow...
Definition: Service.h:31
wrench::ActionExecutionService::IsThereAtLeastOneHostWithAvailableResources
bool IsThereAtLeastOneHostWithAvailableResources(unsigned long num_cores, double ram)
Determine whether there is at least one host with (currently) available resources.
Definition: ActionExecutionService.cpp:992
wrench::ActionExecutionService::terminateAction
void terminateAction(std::shared_ptr< Action > action, ComputeService::TerminationCause termination_cause)
Synchronously terminate an action.
Definition: ActionExecutionService.cpp:733