BusErrorHandler.cpp
Go to the documentation of this file.
1 #include "BusErrorHandler.h"
2 
3 #include <variant>
4 
5 #include <ethercat.h>
6 
8 
11 
13 
14 #include "Bus.h"
15 #include "ErrorReporting.h"
16 #include "EtherCATState.h"
17 #include "SlaveInterface.h"
18 
20 {
21  static constexpr unsigned int workerCounterWatchdogTimeout = 20000;
22 
24  bus(bus), workerCounterWatchdog(workerCounterWatchdogTimeout)
25  {
26  slaveReinitializingThreadRunning.store(true);
27  slaveReinitializingThread = std::thread(&BusErrorHandler::slaveReinitializingLoop, this);
28  }
29 
31  {
32  slaveReinitializingThreadRunning.store(false);
33  slaveReinitializingThread.join();
34  }
35 
36  void
38  {
39  ARMARX_CHECK(expectedWorkCounter > 0);
40  rtUpdateSlaveStates();
41  }
42 
43  void
45  {
46  BUS_TIMING_START(bus_rtHandleSlaveErrors_getErrorReporter)
48  BUS_TIMING_CEND(bus_rtHandleSlaveErrors_getErrorReporter, bus->iterationCount, 0.3)
49 
50  for (const auto& slave : bus->getSlaves())
51  {
52  //try to clear error if there exist some, the rest of the slaves can run normal
53  SLAVE_TIMING_START(bus_rtHandleSlaveErrors_slaveHasError)
54  auto hasError = slave->hasError();
56  slave->getSlaveIdentifier(), bus_rtHandleSlaveErrors_slaveHasError, 0.3)
57  if (hasError)
58  {
59 
60  SLAVE_TIMING_START(bus_rtHandleSlaveErrors_slaveHandleError)
61  if (!slave->handleErrors())
62  {
64  reporter, slave->getSlaveIdentifier(), "Unhandled error in slave")
65  .deactivateSpam(1);
66  }
68  slave->getSlaveIdentifier(), bus_rtHandleSlaveErrors_slaveHandleError, 0.3)
69  }
70  }
71 
72  slaveErrorFound = reporter.hasErrors();
73  }
74 
75  bool
77  {
78  return busErrorFound || slaveErrorFound;
79  }
80 
81  bool
83  {
84  return !allSlavesReinitialized.load();
85  }
86 
87  bool
89  {
90  return slaveStates.at(slave) != SlaveState::Operational;
91  }
92 
93  void
95  {
96  expectedWorkCounter = wkc;
97 
98  for (const auto& s : bus->getSlaves())
99  {
100  slaveStates.insert({s.get(), SlaveState::Operational});
101  }
102  }
103 
104  void
105  BusErrorHandler::rtUpdateSlaveStates()
106  {
107  if (bus->lastWorkCounter < expectedWorkCounter)
108  {
109  if (workerCounterWatchdog.expired())
110  {
111  if (bus->lastWorkCounter == EC_NOFRAME && !allSlavesReinitialized.load())
112  {
113  // Panic!! Bus unsable?
114  BUS_WARNING(bus->iterationCount,
115  "The last work counter is EC_NOFRAME not all slaves have been "
116  "reinitialized yet!");
117  }
118  else if (bus->lastWorkCounter != secondLastUnexpectedWorkCounter)
119  {
120  BUS_INFO(bus->iterationCount,
121  "Last Work Counter: %i, Second Last Unexpected Work Counter: %i, "
122  "Expected Work Counter: %i",
123  bus->lastWorkCounter,
124  secondLastUnexpectedWorkCounter,
125  expectedWorkCounter);
126  secondLastUnexpectedWorkCounter = bus->lastWorkCounter;
127  // Lost slaves or found slaves again or slaves are not in op
128 
129  // Check for lost slaves and mark them
130  rtMarkLostSlavesInSOEMStruct();
131 
132  std::stringstream lost_ss;
133  std::stringstream redisc_ss;
134  bool print_lost = false;
135  bool print_redisc = false;
136  lost_ss << "The following slaves have been lost: \n";
137  redisc_ss << "The following slaves have been rediscovered: \n";
138  for (const auto& slave : bus->getSlaves())
139  {
140  if (ec_slave[slave->getSlaveIdentifier().slaveIndex].islost)
141  {
142  if (slaveStates.at(slave.get()) != SlaveState::Lost)
143  {
144  lost_ss << slave->getSlaveIdentifier().getNameAsCStr() << "("
145  << slave->getSlaveIdentifier().slaveIndex << ")\n";
146  busErrorFound = true;
147  print_lost = true;
148  }
149 
150  slaveStates.at(slave.get()) = SlaveState::Lost;
151  allSlavesReinitialized.store(false);
152  }
153  else
154  {
155  if (slaveStates.at(slave.get()) == SlaveState::Lost)
156  {
157  slaveStates.at(slave.get()) = SlaveState::Reinitializating;
158  print_redisc = true;
159  redisc_ss << slave->getSlaveIdentifier().getNameAsCStr() << "("
160  << slave->getSlaveIdentifier().slaveIndex << ")\n";
161  }
162  }
163  }
164  if (print_lost)
165  {
166  BUS_ERROR(bus->iterationCount, lost_ss.str().c_str());
167  }
168  if (print_redisc)
169  {
170  BUS_INFO(bus->iterationCount, redisc_ss.str().c_str());
171  }
172 
173  // Read bus states and add all slaves that need reinitialization to list
174  if (bus->readStates() != EtherCATState::invalid)
175  {
176  for (const auto& slave : bus->getSlaves())
177  {
178  if (EtherCATState(
179  ec_slave[slave->getSlaveIdentifier().slaveIndex].state) !=
181  ec_slave[slave->getSlaveIdentifier().slaveIndex].islost == 0 &&
182  (slaveStates.at(slave.get()) != SlaveState::Reinitializating))
183  {
184  slaveStates.at(slave.get()) = SlaveState::Reinitializating;
185 
186  // For some slaves, specifically the hands, it might happen that
187  // they need to be reinitialized, but a technical link loss never
188  // occurred. The EtherCAT adapter of the hands is on the wrist and
189  // is connected via an SPI connection to the hand micro controller.
190  // This SPI connection is prone to noise etc. and can be
191  // interrupted. This results in the hand freezing from a user
192  // perspective, while the slave (the EtherCAT chip on the wrist
193  // board) is still responding and connected. Thus, there will never
194  // be a link loss in the error registers, but it still must be
195  // reinitialized. This happens after a watchdog in the hand
196  // firmware detects that the SPI connection was interrupted and
197  // restarts the firmware. This will also reinitialize the slave.
198  // From the bus' perspecive it can be seen that at some point the
199  // slave will leave op state and transition to init back again.
200  // If this happens, then this branch will be executed. The bus
201  // acknowledge that the slave needs reinitialization, but because of
202  // no link loss, no action for reinitialization would be taken.
203  // Setting the variable `allSlavesReinitialized` to false as in the
204  // next line will trigger this reinitialization similar to the case
205  // where a link loss was detected via the errorr registers (see
206  // above).
207  allSlavesReinitialized.store(false);
208  // Trigger SS2 since hands need to be re-tared.
209  busErrorFound = true;
210 
211  BUS_ERROR(bus->iterationCount,
212  "Slave at index %u of type %s needs reinitialization!",
213  slave->getSlaveIdentifier().slaveIndex,
214  slave->getSlaveIdentifier().getNameAsCStr());
215  }
216  }
217  }
218  }
219  }
220  }
221  else
222  {
223  busErrorFound = false;
224  workerCounterWatchdog.reset();
225  secondLastUnexpectedWorkCounter = bus->lastWorkCounter;
226  }
227 
228  return;
229  }
230 
231  void
232  BusErrorHandler::rtMarkLostSlavesInSOEMStruct() const
233  {
234  std::uint16_t w = 0;
235  const int slaveCount = ec_BRD(0x0000, ECT_REG_TYPE, sizeof(w), &w, EC_TIMEOUTSAFE);
236  if (slaveCount == EC_NOFRAME)
237  {
238  return;
239  }
240 
241  if (slaveCount < ec_slavecount)
242  {
243  BUS_WARNING(bus->iterationCount,
244  "Number of Slaves (%i) in SOEM struct is less than configured Slaves (%i).",
245  slaveCount,
246  ec_slavecount);
247  // Create RegisterDataList with request to read register CONFIGURED_STATION_ADDRESS
248  std::vector<RegisterDataList> registers;
249  for (auto i = 1; i <= ec_slavecount; i++)
250  {
251  registers.push_back(
252  RegisterDataList{static_cast<std::uint16_t>(i),
255  }
256 
257  // Read Registers
258  bus->readRegisters(registers);
259 
260  // Iterate over read registers and check which slaves did not answer
261  std::stringstream ss;
262  ss << "Slaves with the following indexes are not reachable: \n";
263  for (const auto& dataList : registers)
264  {
265  auto data =
266  dataList.registerData.at(datatypes::RegisterEnum::CONFIGURED_STATION_ADDRESS);
267  if (std::holds_alternative<datatypes::EtherCATDataType::UNSIGNED16>(data) &&
268  ec_slave[dataList.slaveIndex].configadr ==
269  std::get<datatypes::EtherCATDataType::UNSIGNED16>(data))
270  {
271  ec_slave[dataList.slaveIndex].islost = false;
272  }
273  else
274  {
275  ss << dataList.slaveIndex << ", ";
276  ec_slave[dataList.slaveIndex].islost = true;
277  }
278  }
279  ss.seekp(ss.str().length() - 2);
280  ss << "\n The total number of registers read is " << registers.size() << "\n";
281  BUS_ERROR(bus->iterationCount, "%s", ss.str().c_str());
282  }
283  else
284  {
285  // All slaves are found
286  for (int i = 1; i <= ec_slavecount; i++)
287  {
288  ec_slave[i].islost = false;
289  }
290  }
291  }
292 
293  void
294  BusErrorHandler::reinitializeSlaves()
295  {
296  // Update ec_slave with the current slave states
297  bus->readStates();
298 
299  for (const auto& [slave, state] : slaveStates)
300  {
301  if (state != SlaveState::Reinitializating)
302  {
303  continue;
304  }
305 
306  std::uint16_t index =
307  static_cast<std::uint16_t>(slave->getSlaveIdentifier().slaveIndex);
308 
309  bool slaveReadyAgain = false;
310  // TIMING_START(handleFound_switchEtherCATState)
311  switch (EtherCATState(ec_slave[index].state))
312  {
314  case EtherCATState::init:
315  // This state gets reached when the corresponding slave did a full power cycle
316  // and needs to be initialized from scratch ...
317  // The first step is to recover the slave to validate that the slave is the same
318  // as we lost at that position in the bus.
319  if (ec_recover_slave(index, 10000))
320  {
321  // Since the slave is in fact the same as before,
322  // we can change its state to preOp
324  }
325  else
326  {
327  BUS_ERROR(bus->iterationCount,
328  "Could not recover slave: %s",
329  slave->getSlaveIdentifier().getNameAsCStr());
330  }
331 
332  // TIMING_END_COMMENT(handleFound_switchEtherCATState,
333  // "ReinitializeSlaves - EtherCATState: init (" +
334  // slave->getSlaveIdentifier().getName() + ")")
335  break;
336 
338  // SOEM already provides a function for reconfiguring lost slaves.
339  // That function executes a hook for switching from preOp to safeOp which we have to
340  // set to a lambda which executes our default functions for that transition:
341  ec_slave[index].PO2SOconfig = [](std::uint16_t index) -> int
342  {
343  Bus& bus = Bus::getBus();
344 
345  auto localSlave = bus.getSlaveAtIndex(index);
346 
347  localSlave->doMappings();
348  localSlave->prepareForSafeOp();
349  localSlave->finishPreparingForSafeOp();
350 
351  return 0;
352  };
353  // After executing this function, the corresponding slave will be in safeOp
354  ec_reconfig_slave(index, 10000);
355 
356  // Better reset that hook, just to be sure
357  ec_slave[index].PO2SOconfig = nullptr;
358  // TIMING_END_COMMENT(handleFound_switchEtherCATState,
359  // "ReinitializeSlaves - EtherCATState: preOp (" +
360  // slave->getSlaveIdentifier().getName() + ")")
361  break;
362 
364  // Execute hooks for safeOp to op, similar to the normal bus initialization.
365  slave->prepareForOp();
366  slave->finishPreparingForOp();
367 
368  bus->changeStateOfSlave(index, EtherCATState::op, false);
369 
370  // TIMING_END_COMMENT(handleFound_switchEtherCATState,
371  // "ReinitializeSlaves - EtherCATState: safeOp (" +
372  // slave->getSlaveIdentifier().getName() + ")")
373  break;
374 
376  // This state is reached if the slaves have not been updated for a long time
377  // (>50ms) and the sync manager watchdog gets triggered.
378  // The only way to recover from that is to reinit the slave by switching its
379  // state to init
380  ec_recover_slave(index, 10000);
381 
382  bus->changeStateOfSlave(index, EtherCATState::init, false);
383 
384  // TIMING_END_COMMENT(handleFound_switchEtherCATState,
385  // "ReinitializeSlaves - EtherCATState: safeOpError (" +
386  // slave->getSlaveIdentifier().getName() + ")")
387  break;
388 
389  case EtherCATState::op:
390  if (slave->prepareForRun())
391  {
392  slaveReadyAgain = true;
393  }
394  // TIMING_END_COMMENT(handleFound_switchEtherCATState,
395  // "ReinitializeSlaves - EtherCATState: op (" +
396  // slave->getSlaveIdentifier().getName() + ")")
397  break;
398 
399  default:
400  break;
401  }
402 
403  // Slave was successfully recovered, we can remove it from the list
404  if (slaveReadyAgain)
405  {
406  slaveStates.at(slave) = SlaveState::Operational;
407 
409  << "Slave at index " << slave->getSlaveIdentifier().slaveIndex << " of type "
410  << slave->getSlaveIdentifier().getName()
411  << " has been successfully reinitialized. Current bus iteration number is "
412  << bus->iterationCount;
413 
414  allSlavesReinitialized.store(areAllSlavesReinitialized());
415  }
416  }
417  }
418 
419  void
420  BusErrorHandler::slaveReinitializingLoop()
421  {
422  while (slaveReinitializingThreadRunning.load())
423  {
424  while (!allSlavesReinitialized.load())
425  {
426  reinitializeSlaves();
427  }
428 
429  using namespace std::literals;
430  std::this_thread::sleep_for(100us);
431  }
432  }
433 
434  bool
435  BusErrorHandler::areAllSlavesReinitialized() const
436  {
437  bool r = false;
438  for (const auto& [slave, state] : slaveStates)
439  {
440  r |= (state == SlaveState::Reinitializating);
441  }
442  return !r;
443  }
444 
445  BusErrorHandler::Watchdog::Watchdog(std::uint32_t minDurationUS) :
446  minDuration(minDurationUS), lastTime(armarx::rtNow())
447  {
448  }
449 
450  bool
451  BusErrorHandler::Watchdog::expired() const
452  {
453  return (armarx::rtNow() - lastTime).toMicroSeconds() > minDuration;
454  }
455 
456  void
457  BusErrorHandler::Watchdog::reset()
458  {
459  lastTime = armarx::rtNow();
460  }
461 
462 } // namespace armarx::control::ethercat
armarx::control::ethercat::BusErrorHandler::rtHandleSlaveErrors
void rtHandleSlaveErrors()
Definition: BusErrorHandler.cpp:44
BusErrorHandler.h
armarx::control::ethercat::SlaveInterface
Brief description of class SlaveInterface.
Definition: SlaveInterface.h:27
armarx::control::ethercat::BusErrorHandler::rtHandleBusErrors
void rtHandleBusErrors()
Definition: BusErrorHandler.cpp:37
ARMARX_IMPORTANT
#define ARMARX_IMPORTANT
Definition: Logging.h:190
BUS_WARNING
#define BUS_WARNING(bin,...)
Definition: ErrorReporting.h:274
index
uint8_t index
Definition: EtherCATFrame.h:59
BUS_ERROR
#define BUS_ERROR(bin,...)
Definition: ErrorReporting.h:277
RtTiming.h
armarx::control::ethercat::reporting::Type::Bus
@ Bus
Bussdfnödf.
EtherCATState.h
armarx::control::ethercat::BusErrorHandler::init
void init(int wkc)
Definition: BusErrorHandler.cpp:94
armarx::control::ethercat::BusIO::changeStateOfSlave
EtherCATState changeStateOfSlave(std::uint16_t slaveIndex, EtherCATState state, bool validate=true)
Definition: BusIO.cpp:409
armarx::control::ethercat::BusErrorHandler::~BusErrorHandler
~BusErrorHandler() override
Definition: BusErrorHandler.cpp:30
BUS_INFO
#define BUS_INFO(bin,...)
Definition: ErrorReporting.h:271
armarx::control::ethercat::EtherCATState::invalid
@ invalid
State is not valid, e.g. if a request for reading the actual bus state failed.
Definition: EtherCATState.h:34
armarx::control::ethercat::BusIO::lastWorkCounter
int lastWorkCounter
Definition: BusIO.h:206
armarx::control::ethercat::BusErrorHandler::hasError
bool hasError() const
Definition: BusErrorHandler.cpp:76
armarx::control::ethercat::datatypes::RegisterEnum::CONFIGURED_STATION_ADDRESS
@ CONFIGURED_STATION_ADDRESS
SLAVE_TIMING_START
#define SLAVE_TIMING_START(name)
Definition: Timing.h:186
armarx::control::ethercat::BusIO::readStates
EtherCATState readStates()
Definition: BusIO.cpp:425
ARMARX_CHECK
#define ARMARX_CHECK(expression)
Shortcut for ARMARX_CHECK_EXPRESSION.
Definition: ExpressionException.h:82
Bus.h
armarx::control::ethercat::EtherCATState::safeOpError
@ safeOpError
Safe-operational state after an error has happend.
Definition: EtherCATState.h:51
armarx::control::ethercat::Bus
Brief description of class Bus.
Definition: Bus.h:56
armarx::control::ethercat::datatypes::RegisterEnumTypeContainer
std::variant< EtherCATDataType::INTEGER8, EtherCATDataType::INTEGER16, EtherCATDataType::INTEGER32, EtherCATDataType::INTEGER64, EtherCATDataType::UNSIGNED8, EtherCATDataType::UNSIGNED16, EtherCATDataType::UNSIGNED32, EtherCATDataType::UNSIGNED64 > RegisterEnumTypeContainer
Definition: SlaveRegisters.h:159
armarx::control::ethercat::reporting::Reporting::getErrorReporting
static Reporting & getErrorReporting()
Definition: ErrorReporting.cpp:82
data
uint8_t data[1]
Definition: EtherCATFrame.h:68
BUS_TIMING_CEND
#define BUS_TIMING_CEND(name, bin, thresholdMs)
Definition: Timing.h:181
armarx::control::ethercat
Definition: Bus.cpp:24
armarx::control::ethercat::EtherCATState::preOp
@ preOp
Pre-operational state.
Definition: EtherCATState.h:40
armarx::control::ethercat::reporting::Reporting::getErrorReporter
Reporter getErrorReporter()
Definition: ErrorReporting.cpp:89
armarx::control::ethercat::Bus::getBus
static Bus & getBus()
This returns the one and only Bus object.
Definition: Bus.cpp:29
armarx::control::ethercat::Bus::getSlaves
std::vector< std::experimental::observer_ptr< SlaveInterface > > getSlaves() const
Returns all identifiied slaves on the bus.
Definition: Bus.cpp:1392
SlaveInterface.h
Timing.h
ErrorReporting.h
armarx::control::ethercat::EtherCATState::safeOp
@ safeOp
Safe-operational state.
Definition: EtherCATState.h:46
armarx::control::ethercat::BusErrorHandler::isReinitializationActive
bool isReinitializationActive() const
Definition: BusErrorHandler.cpp:82
TimeUtil.h
armarx::control::ethercat::EtherCATState::init
@ init
Initial state after switch a EtherCAT slave on.
Definition: EtherCATState.h:37
SLAVE_ERROR_LOCAL
#define SLAVE_ERROR_LOCAL(reporter, sid,...)
Definition: ErrorReporting.h:315
armarx::control::ethercat::BusErrorHandler::isSlaveLostOrDuringReinitialization
bool isSlaveLostOrDuringReinitialization(SlaveInterface *slave) const
Definition: BusErrorHandler.cpp:88
armarx::control::ethercat::BusErrorHandler::BusErrorHandler
BusErrorHandler(Bus *bus)
Definition: BusErrorHandler.cpp:23
ControlThreadOutputBuffer.h
armarx::control::ethercat::EtherCATState::op
@ op
Operational state.
Definition: EtherCATState.h:49
BUS_TIMING_START
#define BUS_TIMING_START(name)
Definition: Timing.h:163
SLAVE_TIMING_CEND
#define SLAVE_TIMING_CEND(sid, name, thresholdMs)
Definition: Timing.h:203
armarx::ctrlutil::s
double s(double t, double s0, double v0, double a0, double j)
Definition: CtrlUtil.h:33
armarx
This file offers overloads of toIce() and fromIce() functions for STL container types.
Definition: ArmarXTimeserver.cpp:27
armarx::control::ethercat::BusIO::readRegisters
bool readRegisters(std::vector< RegisterDataList > &registerData)
Definition: BusIO.cpp:439
armarx::rtNow
IceUtil::Time rtNow()
Definition: RtTiming.h:40