WRENCH  1.11
Cyberinfrastructure Simulation Workbench
Overview Installation Getting Started WRENCH 101 WRENCH 102
ActionExecutionService.h
1 
10 #ifndef WRENCH_ACTION_SCHEDULER_H
11 #define WRENCH_ACTION_SCHEDULER_H
12 
13 
14 #include <queue>
15 
16 #include "wrench/services/compute/ComputeService.h"
17 #include "wrench/services/helper_services/host_state_change_detector/HostStateChangeDetector.h"
18 #include "wrench/services/helper_services/action_execution_service/ActionExecutionServiceProperty.h"
19 
20 
21 namespace wrench {
22 
// Forward declarations. In the visible part of this header these types are
// only named (friend declaration, or as std::shared_ptr<> arguments), so no
// full definition is required here.
// NOTE(review): Alarm is not referenced anywhere in the visible declarations;
// confirm whether this forward declaration is still needed.
23  class Simulation;
24  class FailureCause;
25  class Alarm;
26  class Action;
27  class ActionExecutor;
28 
29 
39 
40  private:
41 
// Default property values for this service:
//   TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN -> "false"
//   FAIL_ACTION_AFTER_ACTION_EXECUTOR_CRASH   -> "true"
// Presumably these are overridden by the property_list passed to the
// constructor; confirm in ActionExecutionService.cpp.
42  WRENCH_PROPERTY_COLLECTION_TYPE default_property_values = {
43  {ActionExecutionServiceProperty::TERMINATE_WHENEVER_ALL_RESOURCES_ARE_DOWN, "false"},
44  {ActionExecutionServiceProperty::FAIL_ACTION_AFTER_ACTION_EXECUTOR_CRASH, "true"},
45  };
46 
// Default message payload values: intentionally empty (no entries are listed).
47  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE default_messagepayload_values = {
48  };
49 
50  public:
51 
52  // Public Constructor
//   hostname            - name of a host; presumably the host on which this
//                         service runs (confirm in ActionExecutionService.cpp)
//   compute_resources   - map from a string key to an (unsigned long, double)
//                         tuple, taken by value; judging from the
//                         (num_cores, ram) parameters used elsewhere in this
//                         class, presumably host name -> (cores, RAM) - TODO confirm
//   parent_service      - the service this one belongs to; see the
//                         parent_service member and setParentService()
//   property_list       - optional; defaults to an empty collection
//   messagepayload_list - optional; defaults to an empty collection
53  ActionExecutionService(const std::string &hostname,
54  std::map<std::string, std::tuple<unsigned long, double>> compute_resources,
55  std::shared_ptr<Service> parent_service,
56  WRENCH_PROPERTY_COLLECTION_TYPE property_list = {},
57  WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list = {}
58  );
59 
60  /***********************/
62  /***********************/
63 
// Returns whether the given action can run. The exact criterion is not visible
// in this header - presumably a resource check against this service's hosts.
64  bool actionCanRun(std::shared_ptr<Action> action);
65 
// Accessor for the parent service (see the parent_service member).
66  std::shared_ptr<Service> getParentService() const;
67 
// Replaces the parent service.
68  void setParentService(std::shared_ptr<Service> parent);
69 
// Submits an action to this service.
70  void submitAction(const std::shared_ptr<Action> &action);
71 
// Terminates an action; termination_cause carries the reason as a
// ComputeService::TerminationCause value.
72  void terminateAction(std::shared_ptr<Action> action, ComputeService::TerminationCause termination_cause);
73 
// Going by its name: whether at least one host currently has num_cores cores
// and ram RAM available (ram presumably in bytes - TODO confirm).
// NOTE(review): the leading capital letter is inconsistent with the other
// method names, and a similarly named private method
// isThereAtLeastOneHostWithResources() also exists.
74  bool IsThereAtLeastOneHostWithAvailableResources(unsigned long num_cores, double ram);
75 
// Returns a non-const reference to a compute-resource map (same type as the
// compute_resources member), so callers are able to modify it.
76  std::map<std::string, std::tuple<unsigned long, double>> &getComputeResources();
77 
// Returns a string -> double map for the requested key. The set of valid keys
// is not visible in this header.
78  std::map<std::string, double> getResourceInformation(const std::string &key);
79 
81 
82  /***********************/
84  /***********************/
85 
86  private:
87 
// Simulation is granted access to this class's private members.
88  friend class Simulation;
89 
// Validates this service's properties; what counts as valid is defined in the
// implementation file, not here.
90  void validateProperties();
91 
92 
// Compute resources managed by this service (same type as the constructor's
// compute_resources argument).
93  std::map<std::string, std::tuple<unsigned long, double>> compute_resources;
94 
95  // Per-host availability tracking: bytes of RAM currently available, and number of currently running threads (core availability is presumably derived from the latter)
96  std::unordered_map<std::string, double> ram_availabilities;
97  std::unordered_map<std::string, unsigned long> running_thread_counts;
98 
// Parent service; null until set by the constructor or setParentService().
99  std::shared_ptr<Service> parent_service = nullptr;
100 
// For each StandardJob, a set of DataFiles; by its name, the files that job
// has in scratch space (TODO confirm against the implementation).
101  std::unordered_map<std::shared_ptr<StandardJob> , std::set<std::shared_ptr<DataFile>>> files_in_scratch;
102 
103  // Set of running actions
104  std::set<std::shared_ptr<Action> > running_actions;
105 
106  // Action execution specs: for each action, a (string, unsigned long) tuple - the same type pickAllocation() returns, so presumably (host name, number of cores)
107  std::unordered_map<std::shared_ptr<Action> , std::tuple<std::string, unsigned long>> action_run_specs;
108 
// All actions known to this service, and a double-ended queue of ready actions
// (see dispatchReadyActions()).
109  std::set<std::shared_ptr<Action>> all_actions;
110  std::deque<std::shared_ptr<Action>> ready_actions;
111 
112  // Running ActionExecutors, stored as a map from each Action to its ActionExecutor
113  std::unordered_map<std::shared_ptr<Action> , std::shared_ptr<ActionExecutor>> action_executors;
114 
// Overrides the base class's main(); returns an int.
115  int main() override;
116 
117  // Helper functions to make main() a bit more palatable
118 
// Terminates the service. send_failure_notifications selects whether failure
// notifications are sent; termination_cause carries the reason.
119  void terminate(bool send_failure_notifications, ComputeService::TerminationCause termination_cause);
120 
// By its name: fails the actions that are currently in progress.
121  void failCurrentActions();
122 
// Handlers for the three ways an ActionExecutor can end, going by their names:
// completion, failure, and crash.
123  void processActionExecutorCompletion(std::shared_ptr<ActionExecutor> executor);
124 
125  void processActionExecutorFailure(std::shared_ptr<ActionExecutor> executor);
126 
127  void processActionExecutorCrash(std::shared_ptr<ActionExecutor> executor);
128 
// Handles a request to terminate an action. answer_mailbox is presumably where
// the reply to the requester is sent - TODO confirm.
129  void processActionTerminationRequest(std::shared_ptr<Action> action, simgrid::s4u::Mailbox *answer_mailbox, ComputeService::TerminationCause termination_cause);
130 
// Processes the next message. The meaning of the bool result is not visible
// here - presumably whether the main loop should continue; verify in the .cpp.
131  bool processNextMessage();
132 
// By its name: starts actions taken from ready_actions.
133  void dispatchReadyActions();
134 
135 // void someHostIsBackOn(simgrid::s4u::Host const &h);
136 // bool host_back_on = false;
137 
138 
// Private, unscoped enum with two termination causes.
// NOTE(review): none of the declarations visible in this header use this type
// (they take ComputeService::TerminationCause instead) - confirm it is still
// needed.
140  enum JobTerminationCause {
// Presumably: the job was explicitly terminated.
142  TERMINATED,
143 
// Presumably: the compute service was killed.
145  COMPUTE_SERVICE_KILLED
146  };
147 
// Terminates a running action; the flag records whether the termination is
// due to a job cancelation.
148  void terminateRunningAction(std::shared_ptr<Action> action, bool killed_due_to_job_cancelation);
149 
// Kills an action, with the given failure cause.
150  void killAction(std::shared_ptr<Action> action, std::shared_ptr<FailureCause> cause);
151 
152 
// Handles an action submission; answer_mailbox is presumably where the reply
// is sent - TODO confirm.
153  void processSubmitAction(simgrid::s4u::Mailbox *answer_mailbox, std::shared_ptr<Action> action);
154 
// Picks an allocation for an action and returns it as a
// (string, unsigned long) tuple - presumably (host name, number of cores),
// mirroring required_host / required_num_cores. hosts_to_avoid is taken by
// non-const reference, so the callee is able to modify it.
155  std::tuple<std::string, unsigned long> pickAllocation(std::shared_ptr<Action> action,
156  std::string required_host, unsigned long required_num_cores,
157  std::set<std::string> &hosts_to_avoid);
158 
159 
// Going by its name: whether at least one host has num_cores cores and ram RAM
// at all (contrast with the public IsThereAtLeastOneHostWithAvailableResources()).
160  bool isThereAtLeastOneHostWithResources(unsigned long num_cores, double ram);
161 
// Overrides the base class's cleanup hook.
162  void cleanup(bool has_terminated_cleanly, int return_value) override;
163 
// Going by its name: true when every compute resource is down and no
// ActionExecutor is running.
164  bool areAllComputeResourcesDownWithNoActionExecutorRunning();
165 
166 
// Exit code of the service, initialised to 0.
167  int exit_code = 0;
168 
// Helper service that detects host state changes.
169  std::shared_ptr<HostStateChangeDetector> host_state_change_monitor;
170 
171  };
172 };
173 
174 
175 #endif //WRENCH_ACTION_SCHEDULER_H
wrench::ComputeService::TerminationCause
TerminationCause
Job termination cause enum.
Definition: ComputeService.h:67
wrench::ActionExecutionService
An action execution service that:
Definition: ActionExecutionService.h:38
wrench::ActionExecutionService::ActionExecutionService
ActionExecutionService(const std::string &hostname, std::map< std::string, std::tuple< unsigned long, double >> compute_resources, std::shared_ptr< Service > parent_service, WRENCH_PROPERTY_COLLECTION_TYPE property_list={}, WRENCH_MESSAGE_PAYLOADCOLLECTION_TYPE messagepayload_list={})
Constructor.
Definition: ActionExecutionService.cpp:205
wrench
Definition: Action.cpp:28
wrench::Simulation
A class that provides basic simulation methods. Once the simulation object has been explicitly or imp...
Definition: Simulation.h:48
wrench::Service
A service that can be added to the simulation and that can be used by a WMS when executing a workflow...
Definition: Service.h:31