BusErrorHandler.cpp
Go to the documentation of this file.
1#include "BusErrorHandler.h"
2
3#include <variant>
4
5#include <ethercat.h>
6
8
11
13
14#include "Bus.h"
15#include "ErrorReporting.h"
16#include "EtherCATState.h"
17#include "SlaveInterface.h"
18
20{
21 static constexpr unsigned int workerCounterWatchdogTimeout = 20000;
22
24 bus(bus), workerCounterWatchdog(workerCounterWatchdogTimeout)
25 {
26 slaveReinitializingThreadRunning.store(true);
27 slaveReinitializingThread = std::thread(&BusErrorHandler::slaveReinitializingLoop, this);
28 }
29
31 {
32 slaveReinitializingThreadRunning.store(false);
33 slaveReinitializingThread.join();
34 }
35
36 void
38 {
39 ARMARX_CHECK(expectedWorkCounter > 0);
40 rtUpdateSlaveStates();
41 }
42
43 void
45 {
46 BUS_TIMING_START(bus_rtHandleSlaveErrors_getErrorReporter)
48 BUS_TIMING_CEND(bus_rtHandleSlaveErrors_getErrorReporter, bus->iterationCount, 0.3)
49
50 for (const auto& slave : bus->getSlaves())
51 {
52 //try to clear error if there exist some, the rest of the slaves can run normal
53 SLAVE_TIMING_START(bus_rtHandleSlaveErrors_slaveHasError)
54 auto hasError = slave->hasError();
56 slave->getSlaveIdentifier(), bus_rtHandleSlaveErrors_slaveHasError, 0.3)
57 if (hasError)
58 {
59
60 SLAVE_TIMING_START(bus_rtHandleSlaveErrors_slaveHandleError)
61 if (!slave->handleErrors())
62 {
64 reporter, slave->getSlaveIdentifier(), "Unhandled error in slave")
65 .deactivateSpam(1);
66 }
68 slave->getSlaveIdentifier(), bus_rtHandleSlaveErrors_slaveHandleError, 0.3)
69 }
70 }
71
72 slaveErrorFound = reporter.hasErrors();
73 }
74
75 bool
77 {
78 return busErrorFound || slaveErrorFound;
79 }
80
81 bool
83 {
84 return !allSlavesReinitialized.load();
85 }
86
87 bool
89 {
90 return slaveStates.at(slave) != SlaveState::Operational;
91 }
92
93 void
95 {
96 expectedWorkCounter = wkc;
97
98 for (const auto& s : bus->getSlaves())
99 {
100 slaveStates.insert({s.get(), SlaveState::Operational});
101 }
102 }
103
104 void
105 BusErrorHandler::rtUpdateSlaveStates()
106 {
107 if (bus->lastWorkCounter < expectedWorkCounter)
108 {
109 if (workerCounterWatchdog.expired())
110 {
111 if (bus->lastWorkCounter == EC_NOFRAME && !allSlavesReinitialized.load())
112 {
113 // Panic!! Bus unsable?
114 BUS_WARNING(bus->iterationCount,
115 "The last work counter is EC_NOFRAME not all slaves have been "
116 "reinitialized yet!");
117 }
118 else if (bus->lastWorkCounter != secondLastUnexpectedWorkCounter)
119 {
120 BUS_INFO(bus->iterationCount,
121 "Last Work Counter: %i, Second Last Unexpected Work Counter: %i, "
122 "Expected Work Counter: %i",
123 bus->lastWorkCounter,
124 secondLastUnexpectedWorkCounter,
125 expectedWorkCounter);
126 secondLastUnexpectedWorkCounter = bus->lastWorkCounter;
127 // Lost slaves or found slaves again or slaves are not in op
128
129 // Check for lost slaves and mark them
130 rtMarkLostSlavesInSOEMStruct();
131
132 std::stringstream lost_ss;
133 std::stringstream redisc_ss;
134 bool print_lost = false;
135 bool print_redisc = false;
136 lost_ss << "The following slaves have been lost: \n";
137 redisc_ss << "The following slaves have been rediscovered: \n";
138 for (const auto& slave : bus->getSlaves())
139 {
140 if (ec_slave[slave->getSlaveIdentifier().slaveIndex].islost)
141 {
142 if (slaveStates.at(slave.get()) != SlaveState::Lost)
143 {
144 lost_ss << slave->getSlaveIdentifier().getNameAsCStr() << "("
145 << slave->getSlaveIdentifier().slaveIndex << ")\n";
146 busErrorFound = true;
147 print_lost = true;
148 }
149
150 slaveStates.at(slave.get()) = SlaveState::Lost;
151 allSlavesReinitialized.store(false);
152 }
153 else
154 {
155 if (slaveStates.at(slave.get()) == SlaveState::Lost)
156 {
157 slaveStates.at(slave.get()) = SlaveState::Reinitializating;
158 print_redisc = true;
159 redisc_ss << slave->getSlaveIdentifier().getNameAsCStr() << "("
160 << slave->getSlaveIdentifier().slaveIndex << ")\n";
161 }
162 }
163 }
164 if (print_lost)
165 {
166 BUS_ERROR(bus->iterationCount, lost_ss.str().c_str());
167 }
168 if (print_redisc)
169 {
170 BUS_INFO(bus->iterationCount, redisc_ss.str().c_str());
171 }
172
173 // Read bus states and add all slaves that need reinitialization to list
174 if (bus->readStates() != EtherCATState::invalid)
175 {
176 for (const auto& slave : bus->getSlaves())
177 {
178 if (EtherCATState(
179 ec_slave[slave->getSlaveIdentifier().slaveIndex].state) !=
181 ec_slave[slave->getSlaveIdentifier().slaveIndex].islost == 0 &&
182 (slaveStates.at(slave.get()) != SlaveState::Reinitializating))
183 {
184 slaveStates.at(slave.get()) = SlaveState::Reinitializating;
185
186 // For some slaves, specifically the hands, it might happen that
187 // they need to be reinitialized, but a technical link loss never
188 // occurred. The EtherCAT adapter of the hands is on the wrist and
189 // is connected via an SPI connection to the hand micro controller.
190 // This SPI connection is prone to noise etc. and can be
191 // interrupted. This results in the hand freezing from a user
192 // perspective, while the slave (the EtherCAT chip on the wrist
193 // board) is still responding and connected. Thus, there will never
194 // be a link loss in the error registers, but it still must be
195 // reinitialized. This happens after a watchdog in the hand
196 // firmware detects that the SPI connection was interrupted and
197 // restarts the firmware. This will also reinitialize the slave.
198 // From the bus' perspecive it can be seen that at some point the
199 // slave will leave op state and transition to init back again.
200 // If this happens, then this branch will be executed. The bus
201 // acknowledge that the slave needs reinitialization, but because of
202 // no link loss, no action for reinitialization would be taken.
203 // Setting the variable `allSlavesReinitialized` to false as in the
204 // next line will trigger this reinitialization similar to the case
205 // where a link loss was detected via the errorr registers (see
206 // above).
207 allSlavesReinitialized.store(false);
208 // Trigger SS2 since hands need to be re-tared.
209 busErrorFound = true;
210
211 BUS_ERROR(bus->iterationCount,
212 "Slave at index %u of type %s needs reinitialization!",
213 slave->getSlaveIdentifier().slaveIndex,
214 slave->getSlaveIdentifier().getNameAsCStr());
215 }
216 }
217 }
218 }
219 }
220 }
221 else
222 {
223 busErrorFound = false;
224 workerCounterWatchdog.reset();
225 secondLastUnexpectedWorkCounter = bus->lastWorkCounter;
226 }
227
228 return;
229 }
230
231 void
232 BusErrorHandler::rtMarkLostSlavesInSOEMStruct() const
233 {
234 std::uint16_t w = 0;
235 const int slaveCount = ec_BRD(0x0000, ECT_REG_TYPE, sizeof(w), &w, EC_TIMEOUTSAFE);
236 if (slaveCount == EC_NOFRAME)
237 {
238 return;
239 }
240
241 if (slaveCount < ec_slavecount)
242 {
243 BUS_WARNING(bus->iterationCount,
244 "Number of Slaves (%i) in SOEM struct is less than configured Slaves (%i).",
245 slaveCount,
246 ec_slavecount);
247 // Create RegisterDataList with request to read register CONFIGURED_STATION_ADDRESS
248 std::vector<RegisterDataList> registers;
249 for (auto i = 1; i <= ec_slavecount; i++)
250 {
251 registers.push_back(
252 RegisterDataList{static_cast<std::uint16_t>(i),
255 }
256
257 // Read Registers
258 bus->readRegisters(registers);
259
260 // Iterate over read registers and check which slaves did not answer
261 std::stringstream ss;
262 ss << "Slaves with the following indexes are not reachable: \n";
263 for (const auto& dataList : registers)
264 {
265 auto data =
267 if (std::holds_alternative<datatypes::EtherCATDataType::UNSIGNED16>(data) &&
268 ec_slave[dataList.slaveIndex].configadr ==
269 std::get<datatypes::EtherCATDataType::UNSIGNED16>(data))
270 {
271 ec_slave[dataList.slaveIndex].islost = false;
272 }
273 else
274 {
275 ss << dataList.slaveIndex << ", ";
276 ec_slave[dataList.slaveIndex].islost = true;
277 }
278 }
279 ss.seekp(ss.str().length() - 2);
280 ss << "\n The total number of registers read is " << registers.size() << "\n";
281 BUS_ERROR(bus->iterationCount, "%s", ss.str().c_str());
282 }
283 else
284 {
285 // All slaves are found
286 for (int i = 1; i <= ec_slavecount; i++)
287 {
288 ec_slave[i].islost = false;
289 }
290 }
291 }
292
293 void
294 BusErrorHandler::reinitializeSlaves()
295 {
296 // Update ec_slave with the current slave states
297 bus->readStates();
298
299 for (const auto& [slave, state] : slaveStates)
300 {
301 if (state != SlaveState::Reinitializating)
302 {
303 continue;
304 }
305
306 std::uint16_t index =
307 static_cast<std::uint16_t>(slave->getSlaveIdentifier().slaveIndex);
308
309 bool slaveReadyAgain = false;
310 // TIMING_START(handleFound_switchEtherCATState)
311 switch (EtherCATState(ec_slave[index].state))
312 {
315 // This state gets reached when the corresponding slave did a full power cycle
316 // and needs to be initialized from scratch ...
317 // The first step is to recover the slave to validate that the slave is the same
318 // as we lost at that position in the bus.
319 if (ec_recover_slave(index, 10000))
320 {
321 // Since the slave is in fact the same as before,
322 // we can change its state to preOp
323 bus->changeStateOfSlave(index, EtherCATState::preOp, false);
324 }
325 else
326 {
327 BUS_ERROR(bus->iterationCount,
328 "Could not recover slave: %s",
329 slave->getSlaveIdentifier().getNameAsCStr());
330 }
331
332 // TIMING_END_COMMENT(handleFound_switchEtherCATState,
333 // "ReinitializeSlaves - EtherCATState: init (" +
334 // slave->getSlaveIdentifier().getName() + ")")
335 break;
336
338 // SOEM already provides a function for reconfiguring lost slaves.
339 // That function executes a hook for switching from preOp to safeOp which we have to
340 // set to a lambda which executes our default functions for that transition:
341 ec_slave[index].PO2SOconfig = [](std::uint16_t index) -> int
342 {
343 Bus& bus = Bus::getBus();
344
345 auto localSlave = bus.getSlaveAtIndex(index);
346
347 localSlave->doMappings();
348 localSlave->prepareForSafeOp();
349 localSlave->finishPreparingForSafeOp();
350
351 return 0;
352 };
353 // After executing this function, the corresponding slave will be in safeOp
354 ec_reconfig_slave(index, 10000);
355
356 // Better reset that hook, just to be sure
357 ec_slave[index].PO2SOconfig = nullptr;
358 // TIMING_END_COMMENT(handleFound_switchEtherCATState,
359 // "ReinitializeSlaves - EtherCATState: preOp (" +
360 // slave->getSlaveIdentifier().getName() + ")")
361 break;
362
364 // Execute hooks for safeOp to op, similar to the normal bus initialization.
365 slave->prepareForOp();
366 slave->finishPreparingForOp();
367
368 bus->changeStateOfSlave(index, EtherCATState::op, false);
369
370 // TIMING_END_COMMENT(handleFound_switchEtherCATState,
371 // "ReinitializeSlaves - EtherCATState: safeOp (" +
372 // slave->getSlaveIdentifier().getName() + ")")
373 break;
374
376 // This state is reached if the slaves have not been updated for a long time
377 // (>50ms) and the sync manager watchdog gets triggered.
378 // The only way to recover from that is to reinit the slave by switching its
379 // state to init
380 ec_recover_slave(index, 10000);
381
382 bus->changeStateOfSlave(index, EtherCATState::init, false);
383
384 // TIMING_END_COMMENT(handleFound_switchEtherCATState,
385 // "ReinitializeSlaves - EtherCATState: safeOpError (" +
386 // slave->getSlaveIdentifier().getName() + ")")
387 break;
388
390 if (slave->prepareForRun())
391 {
392 slaveReadyAgain = true;
393 }
394 // TIMING_END_COMMENT(handleFound_switchEtherCATState,
395 // "ReinitializeSlaves - EtherCATState: op (" +
396 // slave->getSlaveIdentifier().getName() + ")")
397 break;
398
399 default:
400 break;
401 }
402
403 // Slave was successfully recovered, we can remove it from the list
404 if (slaveReadyAgain)
405 {
406 slaveStates.at(slave) = SlaveState::Operational;
407
409 << "Slave at index " << slave->getSlaveIdentifier().slaveIndex << " of type "
410 << slave->getSlaveIdentifier().getName()
411 << " has been successfully reinitialized. Current bus iteration number is "
412 << bus->iterationCount;
413
414 allSlavesReinitialized.store(areAllSlavesReinitialized());
415 }
416 }
417 }
418
419 void
420 BusErrorHandler::slaveReinitializingLoop()
421 {
422 while (slaveReinitializingThreadRunning.load())
423 {
424 while (!allSlavesReinitialized.load())
425 {
426 reinitializeSlaves();
427 }
428
429 using namespace std::literals;
430 std::this_thread::sleep_for(100us);
431 }
432 }
433
434 bool
435 BusErrorHandler::areAllSlavesReinitialized() const
436 {
437 bool r = false;
438 for (const auto& [slave, state] : slaveStates)
439 {
440 r |= (state == SlaveState::Reinitializating);
441 }
442 return !r;
443 }
444
445 BusErrorHandler::Watchdog::Watchdog(std::uint32_t minDurationUS) :
446 minDuration(minDurationUS), lastTime(armarx::rtNow())
447 {
448 }
449
450 bool
451 BusErrorHandler::Watchdog::expired() const
452 {
453 return (armarx::rtNow() - lastTime).toMicroSeconds() > minDuration;
454 }
455
456 void
457 BusErrorHandler::Watchdog::reset()
458 {
459 lastTime = armarx::rtNow();
460 }
461
462} // namespace armarx::control::ethercat
#define SLAVE_ERROR_LOCAL(reporter, sid,...)
#define BUS_ERROR(bin,...)
#define BUS_WARNING(bin,...)
#define BUS_INFO(bin,...)
uint8_t data[1]
uint8_t index
#define BUS_TIMING_CEND(name, bin, thresholdMs)
Definition Timing.h:181
#define SLAVE_TIMING_CEND(sid, name, thresholdMs)
Definition Timing.h:203
#define BUS_TIMING_START(name)
Definition Timing.h:163
#define SLAVE_TIMING_START(name)
Definition Timing.h:186
bool isSlaveLostOrDuringReinitialization(SlaveInterface *slave) const
Brief description of class Bus.
Definition Bus.h:57
std::vector< std::experimental::observer_ptr< SlaveInterface > > getSlaves() const
Returns all identifiied slaves on the bus.
Definition Bus.cpp:1392
static Bus & getBus()
This returns the one and only Bus object.
Definition Bus.cpp:29
@ init
Initial state after switch a EtherCAT slave on.
@ invalid
State is not valid, e.g. if a request for reading the actual bus state failed.
@ safeOpError
Safe-operational state after an error has happend.
Brief description of class SlaveInterface.
#define ARMARX_CHECK(expression)
Shortcut for ARMARX_CHECK_EXPRESSION.
#define ARMARX_IMPORTANT
The logging level for always important information, but expected behaviour (in contrast to ARMARX_WAR...
Definition Logging.h:190
std::variant< EtherCATDataType::INTEGER8, EtherCATDataType::INTEGER16, EtherCATDataType::INTEGER32, EtherCATDataType::INTEGER64, EtherCATDataType::UNSIGNED8, EtherCATDataType::UNSIGNED16, EtherCATDataType::UNSIGNED32, EtherCATDataType::UNSIGNED64 > RegisterEnumTypeContainer
IceUtil::Time rtNow()
Definition RtTiming.h:40