yahoo-eng-team team mailing list archive

[Bug 1247603] Re: nova-conductor process can't create consumer connection to qpid after HeartbeatTimeout in heavy workload

 

Hit this today on the latest Havana build, logs below.  I reproduced it
with some stress testing: creating 50 boot-from-volume instances in one
operation (a rough sketch of the request is below).  Need to try it in my
Icehouse setup next.
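
The 50-instance boot-from-volume request can be expressed roughly like this
with python-novaclient (a sketch only, not the exact commands used here;
credentials, flavor/image names and the volume UUID are placeholders):

    # Rough reproduction sketch, Havana-era python-novaclient (v1_1 API).
    # Everything below is a placeholder; 'vda' maps to a bootable volume using
    # the legacy "<vol-id>:<type>:<size>:<delete-on-terminate>" mapping format.
    from novaclient.v1_1 import client

    nova = client.Client('admin', 'secret', 'admin',
                         'http://keystone.example.com:5000/v2.0')

    # Ask for all 50 servers in a single API request, each booting from the
    # same kind of volume-backed block device mapping.
    nova.servers.create(
        name='bfv-stress',
        image=nova.images.find(name='cirros-0.3.1'),
        flavor=nova.flavors.find(name='m1.small'),
        min_count=50,
        max_count=50,
        block_device_mapping={'vda': '11111111-2222-3333-4444-555555555555:::0'})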

2014-03-20 00:42:51.725 17580 INFO nova.compute.manager [req-ef61a326-288b-494d-9d30-f533e7739949 None None] Updating bandwidth usage cache
2014-03-20 00:43:51.843 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources
2014-03-20 00:43:52.607 17580 AUDIT nova.compute.resource_tracker [-] Free ram (MB): 257773
2014-03-20 00:43:52.607 17580 AUDIT nova.compute.resource_tracker [-] Free disk (GB): 49
2014-03-20 00:43:52.607 17580 AUDIT nova.compute.resource_tracker [-] Free VCPUS: 48
2014-03-20 00:43:52.677 17580 INFO nova.compute.resource_tracker [-] Compute_service record updated for os-1.solidfire.net:os-1.solidfire.net
2014-03-20 00:44:52.808 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources
2014-03-20 00:44:53.589 17580 AUDIT nova.compute.resource_tracker [-] Free ram (MB): 257773
2014-03-20 00:44:53.589 17580 AUDIT nova.compute.resource_tracker [-] Free disk (GB): 49
2014-03-20 00:44:53.590 17580 AUDIT nova.compute.resource_tracker [-] Free VCPUS: 48
2014-03-20 00:44:53.657 17580 INFO nova.compute.resource_tracker [-] Compute_service record updated for os-1.solidfire.net:os-1.solidfire.net
2014-03-20 08:47:05.277 17580 ERROR nova.openstack.common.rpc.impl_qpid [-] Failed to publish message to topic 'conductor': heartbeat timeout
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid Traceback (most recent call last):
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 540, in ensure
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     return method(*args, **kwargs)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 632, in _publisher_send
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     publisher = cls(self.conf, self.session, topic)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 398, in __init__
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     super(TopicPublisher, self).__init__(conf, session, node_name)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 328, in __init__
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     self.reconnect(session)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 332, in reconnect
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     self.sender = session.sender(self.address)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "<string>", line 6, in sender
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 592, in sender
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     sender._ewait(lambda: sender.linked)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 799, in _ewait
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     result = self.session._ewait(lambda: self.error or predicate(), timeout)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 566, in _ewait
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     result = self.connection._ewait(lambda: self.error or predicate(), timeout)
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 209, in _ewait
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     self.check_error()
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 202, in check_error
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid     raise self.error
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid HeartbeatTimeout: heartbeat timeout
2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid
2014-03-20 08:47:05.295 17580 ERROR nova.openstack.common.rpc.impl_qpid [-] Failed to consume message from queue: heartbeat timeout
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid Traceback (most recent call last):
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 540, in ensure
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     return method(*args, **kwargs)
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 597, in _consume
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     nxt_receiver = self.session.next_receiver(timeout=timeout)
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "<string>", line 6, in next_receiver
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 660, in next_receiver
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     if self._ecwait(lambda: self.incoming, timeout):
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 50, in _ecwait
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     result = self._ewait(lambda: self.closed or predicate(), timeout)
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 566, in _ewait
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     result = self.connection._ewait(lambda: self.error or predicate(), timeout)
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 209, in _ewait
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     self.check_error()
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 202, in check_error
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid     raise self.error
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid HeartbeatTimeout: heartbeat timeout
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid
2014-03-20 08:47:05.301 17580 ERROR nova.openstack.common.rpc.impl_qpid [-] Failed to consume message from queue: heartbeat timeout
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid Traceback (most recent call last):
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 540, in ensure
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     return method(*args, **kwargs)
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 597, in _consume
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     nxt_receiver = self.session.next_receiver(timeout=timeout)
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "<string>", line 6, in next_receiver
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 660, in next_receiver
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     if self._ecwait(lambda: self.incoming, timeout):
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 50, in _ecwait
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     result = self._ewait(lambda: self.closed or predicate(), timeout)
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 566, in _ewait
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     result = self.connection._ewait(lambda: self.error or predicate(), timeout)
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 209, in _ewait
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     self.check_error()
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid   File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 202, in check_error
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid     raise self.error
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid HeartbeatTimeout: heartbeat timeout
2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid
2014-03-20 08:47:05.326 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672
2014-03-20 08:47:05.330 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672
2014-03-20 08:47:05.332 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672
2014-03-20 08:47:05.335 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672
2014-03-20 08:47:05.458 17580 INFO nova.compute.manager [-] Running instance usage audit for host os-1.solidfire.net from 2014-03-20 13:00:00 to 2014-03-20 14:00:00. 0 instances.
2014-03-20 08:47:05.532 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources
2014-03-20 08:47:06.269 17580 AUDIT nova.compute.resource_tracker [-] Free ram (MB): 257773
2014-03-20 08:47:06.269 17580 AUDIT nova.compute.resource_tracker [-] Free disk (GB): 49
2014-03-20 08:47:06.269 17580 AUDIT nova.compute.resource_tracker [-] Free VCPUS: 48
2014-03-20 08:47:06.336 17580 INFO nova.compute.resource_tracker [-] Compute_service record updated for os-1.solidfire.net:os-1.solidfire.net
2014-03-20 08:47:06.482 17580 INFO nova.compute.manager [-] Updating bandwidth usage cache
2014-03-20 08:48:06.692 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources

** Changed in: nova
       Status: Invalid => New

-- 
You received this bug notification because you are a member of Yahoo!
Engineering Team, which is subscribed to OpenStack Compute (nova).
https://bugs.launchpad.net/bugs/1247603

Title:
  nova-conductor process can't create consumer connection to qpid after
  HeartbeatTimeout in heavy workload

Status in OpenStack Compute (Nova):
  New

Bug description:
  nova-conductor loses its queue and stops receiving requests after running
  under workload for some time. The same thing also occurs in the
  nova-compute process; they share the same impl_qpid.py.

  When nova-conductor is under heavy workload, a HeartbeatTimeout exception
  is raised. The exception is caught and a reconnect to the qpid server is
  attempted. The logs show that we cannot reconnect to qpid in method
  iterconsume, but can reconnect in method publisher_send. That means we can
  only send messages to the qpid queue, but cannot receive messages from it.
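
  For context, the heartbeat that times out is the client-side qpid.messaging
  heartbeat, which impl_qpid enables via the qpid_heartbeat option. The same
  consumer pattern can be run outside nova with a few lines of qpid.messaging
  (a minimal sketch; the broker address is taken from the logs above, but the
  queue/address name and the 60 second heartbeat are illustrative):

      # Minimal standalone sketch of the consumer loop impl_qpid runs.  If the
      # heartbeat exchange with the broker stalls for too long, next_receiver()
      # raises HeartbeatTimeout, exactly as in the tracebacks above.
      from qpid.messaging import Connection
      from qpid.messaging.exceptions import Empty, HeartbeatTimeout

      conn = Connection('172.27.33.21:5672')
      conn.heartbeat = 60      # impl_qpid sets this from CONF.qpid_heartbeat
      conn.open()
      session = conn.session()
      session.receiver('conductor; {create: always, node: {type: topic}}')

      try:
          while True:
              try:
                  receiver = session.next_receiver(timeout=1)
                  msg = receiver.fetch()
                  session.acknowledge(msg)
              except Empty:
                  continue
      except HeartbeatTimeout:
          # This is the point where ensure() catches the error and reconnects;
          # the bug is that the consumer side never becomes usable again.
          conn.close()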

  impl_qpid.py:

      def ensure(self, error_callback, method, *args, **kwargs):
          while True:
              try:
                  return method(*args, **kwargs)  # <-- HeartbeatTimeout is raised here
              except (qpid_exceptions.Empty,
                      qpid_exceptions.ConnectionError), e:
                  if error_callback:
                      error_callback(e)
                  self.reconnect()                # <-- reconnect, then retry

  
  The ensure method is used in iterconsume:

      def iterconsume(self, limit=None, timeout=None):
          """Return an iterator that will consume from all queues/consumers"""

          def _error_callback(exc):
              if isinstance(exc, qpid_exceptions.Empty):
                  LOG.debug(_('Timed out waiting for RPC response: %s') %
                            str(exc))
                  raise rpc_common.Timeout()
              else:
                  LOG.exception(_('Failed to consume message from queue: %s') %
                                str(exc))

          def _consume():
              nxt_receiver = self.session.next_receiver(timeout=timeout)
              try:
                  self._lookup_consumer(nxt_receiver).consume()
              except Exception:
                  LOG.exception(_("Error processing message.  Skipping it."))

          for iteration in itertools.count(0):
              if limit and iteration >= limit:
                  raise StopIteration
              yield self.ensure(_error_callback, _consume)  # <-- the reconnect triggered here does not recover the consumer

  
  and in publisher_send:

      def publisher_send(self, cls, topic, msg):
          """Send to a publisher based on the publisher class"""

          def _connect_error(exc):
              log_info = {'topic': topic, 'err_str': str(exc)}
              LOG.exception(_("Failed to publish message to topic "
                            "'%(topic)s': %(err_str)s") % log_info)

          def _publisher_send():
              publisher = cls(self.conf, self.session, topic)
              publisher.send(msg)

          return self.ensure(_connect_error, _publisher_send)  # <-- the reconnect triggered here succeeds
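
  One way to read this asymmetry: on every retry, _publisher_send constructs a
  brand-new publisher against the current self.session, so after reconnect()
  swaps in a fresh session the publisher ends up attached to it. _consume, by
  contrast, only calls next_receiver() and trusts that reconnect() has
  re-attached every existing consumer to the new session. A rough sketch of a
  more defensive _consume inside iterconsume (illustrative only; it assumes
  self.consumers and consumer.reconnect() behave as in the nova/oslo qpid
  driver, and it is not the actual upstream fix):

          def _consume():
              # Illustrative sketch: if ensure() just reconnected, self.session
              # is a new object, and a consumer still bound to the old session
              # will never see messages, so re-attach it before waiting.
              for consumer in self.consumers.values():
                  if getattr(consumer, 'session', None) is not self.session:
                      consumer.reconnect(self.session)

              nxt_receiver = self.session.next_receiver(timeout=timeout)
              try:
                  self._lookup_consumer(nxt_receiver).consume()
              except Exception:
                  LOG.exception(_("Error processing message.  Skipping it."))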

To manage notifications about this bug go to:
https://bugs.launchpad.net/nova/+bug/1247603/+subscriptions