← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~cjwatson/launchpad/explicit-proxy-product-release-finder-more into lp:launchpad

 

Colin Watson has proposed merging lp:~cjwatson/launchpad/explicit-proxy-product-release-finder-more into lp:launchpad.

Commit message:
Convert the product release finder's HTTPWalker to urlfetch.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/explicit-proxy-product-release-finder-more/+merge/349175

I missed this when converting the rest of the product release finder.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/launchpad/explicit-proxy-product-release-finder-more into lp:launchpad.
=== modified file 'lib/lp/registry/scripts/productreleasefinder/walker.py'
--- lib/lp/registry/scripts/productreleasefinder/walker.py	2018-05-06 08:52:34 +0000
+++ lib/lp/registry/scripts/productreleasefinder/walker.py	2018-07-09 11:53:19 +0000
@@ -13,10 +13,8 @@
     ]
 
 import ftplib
-import os
 import socket
 from urllib import unquote_plus
-import urllib2
 from urlparse import (
     urljoin,
     urlsplit,
@@ -30,10 +28,16 @@
     InvalidURIError,
     URI,
     )
+import requests
 import scandir
 
 from lp.registry.scripts.productreleasefinder import log
 from lp.services.beautifulsoup import BeautifulSoup
+from lp.services.config import config
+from lp.services.timeout import (
+    TimeoutError,
+    urlfetch,
+    )
 from lp.services.webapp.url import urlappend
 
 
@@ -47,19 +51,6 @@
     pass
 
 
-class Request(urllib2.Request):
-    """A urllib2 Request object that can override the request method."""
-
-    method = None
-
-    def get_method(self):
-        """See `urllib2.Request`."""
-        if self.method is not None:
-            return self.method
-        else:
-            return urllib2.Request.get_method(self)
-
-
 class WalkerBase:
     """Base class for URL walkers.
 
@@ -278,21 +269,6 @@
     # Whether to ignore or parse fragments in the URL
     FRAGMENTS = True
 
-    # All the urls handlers used to support the schemas. Redirects are not
-    # supported.
-    handlers = (
-        urllib2.ProxyHandler,
-        urllib2.UnknownHandler,
-        urllib2.HTTPHandler,
-        urllib2.HTTPDefaultErrorHandler,
-        urllib2.HTTPSHandler,
-        urllib2.HTTPDefaultErrorHandler,
-        urllib2.FTPHandler,
-        urllib2.FileHandler,
-        urllib2.HTTPErrorProcessor)
-
-    _opener = None
-
     def open(self):
         """Open the HTTP connection."""
         self.log.info('Walking %s://%s', self.scheme, self.host)
@@ -304,19 +280,12 @@
     def request(self, method, path):
         """Make an HTTP request.
 
-        Returns the HTTPResponse object.
+        Returns the Response object.
         """
-        # We build a custom opener, because we don't want redirects to be
-        # followed.
-        if self._opener is None:
-            self._opener = urllib2.OpenerDirector()
-            for handler in self.handlers:
-                self._opener.add_handler(handler())
-
         self.log.debug("Requesting %s with method %s", path, method)
-        request = Request(urljoin(self.base, path))
-        request.method = method
-        return self._opener.open(request)
+        return urlfetch(
+            urljoin(self.base, path), method=method, allow_redirects=False,
+            trust_env=False, use_proxy=True, allow_ftp=True)
 
     def isDirectory(self, path):
         """Return whether the path is a directory.
@@ -335,19 +304,15 @@
 
         self.log.debug("Checking if %s is a directory" % path)
         try:
-            self.request("HEAD", path)
+            response = self.request("HEAD", path)
+        except (TimeoutError, requests.RequestException) as exc:
+            raise HTTPWalkerError(str(exc))
+
+        if not response.is_redirect or "location" not in response.headers:
             return False
-        except urllib2.HTTPError as exc:
-            if exc.code != 301:
-                return False
-        except (IOError, socket.error) as exc:
-            # Raise HTTPWalkerError for other IO or socket errors.
-            raise HTTPWalkerError(str(exc))
-
-        # We have a 301 redirect error from here on.
-        url = exc.hdrs.getheader("location")
-        (scheme, netloc, redirect_path, query, fragment) \
-                 = urlsplit(url, self.scheme, self.FRAGMENTS)
+        url = response.headers["location"]
+        scheme, netloc, redirect_path, _, _ = urlsplit(
+            url, self.scheme, self.FRAGMENTS)
 
         if len(scheme) and scheme != self.scheme:
             return False
@@ -368,11 +333,8 @@
         self.log.info("Listing %s" % dirname)
         try:
             response = self.request("GET", dirname)
-            try:
-                soup = BeautifulSoup(response.read())
-            finally:
-                response.close()
-        except (IOError, socket.error) as exc:
+            soup = BeautifulSoup(response.content)
+        except (TimeoutError, requests.RequestException) as exc:
             raise HTTPWalkerError(str(exc))
 
         base = URI(self.base).resolve(dirname)
@@ -413,9 +375,9 @@
     """Return a walker for the URL given."""
     (scheme, netloc, path, query, fragment) = urlsplit(url, "file")
     if scheme in ["ftp"]:
-        # If ftp_proxy is set, use the HTTPWalker class since we are
-        # talking to an HTTP proxy.
-        if 'ftp_proxy' in os.environ:
+        # If an HTTP proxy is configured, use the HTTPWalker class so that
+        # FTP requests are made via the proxy.
+        if config.launchpad.http_proxy:
             return HTTPWalker(url, log_parent)
         else:
             return FTPWalker(url, log_parent)

=== modified file 'lib/lp/registry/tests/test_prf_walker.py'
--- lib/lp/registry/tests/test_prf_walker.py	2018-01-02 16:10:26 +0000
+++ lib/lp/registry/tests/test_prf_walker.py	2018-07-09 11:53:19 +0000
@@ -1,15 +1,24 @@
-# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2018 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Tests for lp.registry.scripts.productreleasefinder.walker."""
 
 import logging
 import StringIO
-import urlparse
-
-from lazr.restful.utils import safe_hasattr
-
-from lp.registry.scripts.productreleasefinder.walker import WalkerBase
+
+import responses
+
+from lp.registry.scripts.productreleasefinder.walker import (
+    combine_url,
+    FTPWalker,
+    HTTPWalker,
+    WalkerBase,
+    WalkerError,
+    )
+from lp.services.timeout import (
+    get_default_timeout_function,
+    set_default_timeout_function,
+    )
 from lp.testing import (
     reset_logging,
     TestCase,
@@ -20,14 +29,12 @@
 
     def testCreatesDefaultLogger(self):
         """WalkerBase creates a default logger."""
-        from logging import Logger
         w = WalkerBase("/")
-        self.assertTrue(isinstance(w.log, Logger))
+        self.assertTrue(isinstance(w.log, logging.Logger))
 
     def testCreatesChildLogger(self):
         """WalkerBase creates a child logger if given a parent."""
-        from logging import getLogger
-        parent = getLogger("foo")
+        parent = logging.getLogger("foo")
         w = WalkerBase("/", log_parent=parent)
         self.assertEqual(w.log.parent, parent)
 
@@ -56,8 +63,6 @@
 
     def testWrongScheme(self):
         """WalkerBase raises WalkerError when given an unhandled scheme."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            WalkerBase, WalkerError)
         self.assertRaises(WalkerError, WalkerBase, "foo://localhost/")
 
     def testUnescapesHost(self):
@@ -201,35 +206,25 @@
 
     def testFtpScheme(self):
         """FTPWalker works when initialized with an ftp-scheme URL."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            FTPWalker)
         w = FTPWalker("ftp://localhost/";)
         self.assertEqual(w.host, "localhost")
 
     def testNoScheme(self):
         """FTPWalker works when given a URL with no scheme."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            FTPWalker)
         w = FTPWalker("/")
         self.assertEqual(w.host, "")
 
     def testWrongScheme(self):
         """FTPWalker raises WalkerError when given an unhandled scheme."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            FTPWalker, WalkerError)
         self.assertRaises(WalkerError, FTPWalker, "http://localhost/";)
 
     def testNoUsername(self):
         """FTPWalker stores 'anonymous' when there is no username."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            FTPWalker)
         w = FTPWalker("ftp://localhost/";)
         self.assertEqual(w.user, "anonymous")
 
     def testNoPassword(self):
         """FTPWalker stores empty string when there is no password."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            FTPWalker)
         w = FTPWalker("ftp://scott@localhost/";)
         self.assertEqual(w.passwd, "")
 
@@ -238,88 +233,35 @@
 
     def testHttpScheme(self):
         """HTTPWalker works when initialized with an http-scheme URL."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker)
         w = HTTPWalker("http://localhost/";)
         self.assertEqual(w.host, "localhost")
 
     def testHttpsScheme(self):
         """HTTPWalker works when initialized with an https-scheme URL."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker)
         w = HTTPWalker("https://localhost/";)
         self.assertEqual(w.host, "localhost")
 
     def testNoScheme(self):
         """HTTPWalker works when given a URL with no scheme."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker)
         w = HTTPWalker("/")
         self.assertEqual(w.host, "")
 
     def testWrongScheme(self):
         """HTTPWalker raises WalkerError when given an unhandled scheme."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker, WalkerError)
         self.assertRaises(WalkerError, HTTPWalker, "foo://localhost/")
 
 
-class HTTPWalker_url_schemes_and_handlers(TestCase):
-    """Verify there is a handler for each URL scheme."""
+class HTTPWalker_ListDir(TestCase):
 
     def setUp(self):
-        TestCase.setUp(self)
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker)
-        self.walker = HTTPWalker("http://localhost/";)
-
-    def verify_url_scheme_and_handler(self, scheme, handler):
-        self.assertIn(scheme, self.walker.URL_SCHEMES)
-        self.assertIn(handler, self.walker.handlers)
-        # urllib2 uses a naming convention to select the handler for
-        # a URL scheme. This test is sanity to check to ensure that the
-        # HTTPWalker's configuration of the OpenerDirector is will work.
-        method_name = '%s_open' % scheme
-        self.assertTrue(safe_hasattr(handler, method_name))
-
-    def test_http_request(self):
-        import urllib2
-        self.verify_url_scheme_and_handler('http', urllib2.HTTPHandler)
-
-    def test_https_request(self):
-        import urllib2
-        self.verify_url_scheme_and_handler('https', urllib2.HTTPSHandler)
-
-    def test_ftp_request(self):
-        import urllib2
-        self.verify_url_scheme_and_handler('ftp', urllib2.FTPHandler)
-
-
-class HTTPWalker_ListDir(TestCase):
-
-    def tearDown(self):
-        reset_logging()
-        super(HTTPWalker_ListDir, self).tearDown()
-
-    def setUpWalker(self, listing_url, listing_content):
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker)
-        test = self
-
-        class TestHTTPWalker(HTTPWalker):
-
-            def request(self, method, path):
-                test.assertEqual(method, 'GET')
-                test.assertEqual(urlparse.urljoin(self.base, path),
-                                 listing_url)
-                return StringIO.StringIO(listing_content)
-
-            def isDirectory(self, path):
-                return path.endswith('/')
-
-        logging.basicConfig(level=logging.CRITICAL)
-        return TestHTTPWalker(listing_url, logging.getLogger())
-
+        super(HTTPWalker_ListDir, self).setUp()
+        self.addCleanup(reset_logging)
+        original_timeout_function = get_default_timeout_function()
+        set_default_timeout_function(lambda: 60.0)
+        self.addCleanup(
+            set_default_timeout_function, original_timeout_function)
+
+    @responses.activate
     def testApacheListing(self):
         # Test that list() handles a standard Apache dir listing.
         content = '''
@@ -341,15 +283,22 @@
 <address>Apache/2.2.3 (Unix) Server at <a href="mailto:ftp-adm@xxxxxxxxxx";>ftp.acc.umu.se</a> Port 80</address>
 </body></html>
         '''
-        walker = self.setUpWalker(
-            'http://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/', content)
+        listing_url = 'http://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/'
+        responses.add('GET', listing_url, body=content)
+        expected_filenames = [
+            'LATEST-IS-0.5.0',
+            'gnome-gpg-0.5.0.md5sum',
+            'gnome-gpg-0.5.0.tar.bz2',
+            'gnome-gpg-0.5.0.tar.gz',
+            ]
+        for filename in expected_filenames:
+            responses.add('HEAD', listing_url + filename)
+        walker = HTTPWalker(listing_url, logging.getLogger())
         dirnames, filenames = walker.list('/pub/GNOME/sources/gnome-gpg/0.5/')
         self.assertEqual(dirnames, [])
-        self.assertEqual(filenames, ['LATEST-IS-0.5.0',
-                                     'gnome-gpg-0.5.0.md5sum',
-                                     'gnome-gpg-0.5.0.tar.bz2',
-                                     'gnome-gpg-0.5.0.tar.gz'])
+        self.assertEqual(filenames, expected_filenames)
 
+    @responses.activate
     def testSquidFtpListing(self):
         # Test that a Squid FTP listing can be parsed.
         content = '''
@@ -375,8 +324,9 @@
 Generated Wed, 06 Sep 2006 11:04:02 GMT by squid (squid/2.5.STABLE12)
 </ADDRESS></BODY></HTML>
         '''
-        walker = self.setUpWalker(
-            'ftp://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/', content)
+        listing_url = 'ftp://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/'
+        responses.add('GET', listing_url, body=content)
+        walker = HTTPWalker(listing_url, logging.getLogger())
         dirnames, filenames = walker.list('/pub/GNOME/sources/gnome-gpg/0.5/')
         self.assertEqual(dirnames, [])
         self.assertEqual(filenames, ['LATEST-IS-0.5.0',
@@ -384,9 +334,10 @@
                                      'gnome-gpg-0.5.0.tar.bz2',
                                      'gnome-gpg-0.5.0.tar.gz'])
 
+    @responses.activate
     def testNonAsciiListing(self):
         # Test that list() handles non-ASCII output.
-        content = '''
+        content = b'''
         <html>
           <head>
             <title>Listing</title>
@@ -409,11 +360,17 @@
           </pre>
         </html>
         '''
-        walker = self.setUpWalker('http://example.com/foo/', content)
+        listing_url = 'http://example.com/foo/'
+        responses.add('GET', listing_url, body=content)
+        expected_filenames = ['file1', 'file2', 'file3', 'file99']
+        for filename in expected_filenames:
+            responses.add('HEAD', listing_url + filename)
+        walker = HTTPWalker(listing_url, logging.getLogger())
         dirnames, filenames = walker.list('/foo/')
         self.assertEqual(dirnames, ['subdir1/', 'subdir2/', 'subdir3/'])
-        self.assertEqual(filenames, ['file1', 'file2', 'file3', 'file99'])
+        self.assertEqual(filenames, expected_filenames)
 
+    @responses.activate
     def testDotPaths(self):
         # Test that paths containing dots are handled correctly.
         #
@@ -436,11 +393,15 @@
           </pre>
         </html>
         '''
-        walker = self.setUpWalker('http://example.com/foo/', content)
+        listing_url = 'http://example.com/foo/'
+        responses.add('GET', listing_url, body=content)
+        responses.add('HEAD', listing_url + 'file2')
+        walker = HTTPWalker(listing_url, logging.getLogger())
         dirnames, filenames = walker.list('/foo/')
         self.assertEqual(dirnames, ['dir/'])
         self.assertEqual(filenames, ['file2'])
 
+    @responses.activate
     def testNamedAnchors(self):
         # Test that the directory listing parser code handles named anchors.
         # These are <a> tags without an href attribute.
@@ -458,15 +419,21 @@
           </pre>
         </html>
         '''
-        walker = self.setUpWalker('http://example.com/foo/', content)
+        listing_url = 'http://example.com/foo/'
+        responses.add('GET', listing_url, body=content)
+        responses.add('HEAD', listing_url + 'file1')
+        walker = HTTPWalker(listing_url, logging.getLogger())
         dirnames, filenames = walker.list('/foo/')
         self.assertEqual(dirnames, ['dir1/'])
         self.assertEqual(filenames, ['file1'])
 
+    @responses.activate
     def testGarbageListing(self):
         # Make sure that garbage doesn't trip up the dir lister.
-        content = '\x01\x02\x03\x00\xff\xf2\xablkjsdflkjsfkljfds'
-        walker = self.setUpWalker('http://example.com/foo/', content)
+        content = b'\x01\x02\x03\x00\xff\xf2\xablkjsdflkjsfkljfds'
+        listing_url = 'http://example.com/foo/'
+        responses.add('GET', listing_url, body=content)
+        walker = HTTPWalker(listing_url, logging.getLogger())
         dirnames, filenames = walker.list('/foo/')
         self.assertEqual(dirnames, [])
         self.assertEqual(filenames, [])
@@ -481,8 +448,6 @@
     def testFtpIsDirectory(self):
         # Test that no requests are made by isDirectory() when walking
         # FTP sites.
-        from lp.registry.scripts.productreleasefinder.walker import (
-            HTTPWalker)
         test = self
 
         class TestHTTPWalker(HTTPWalker):
@@ -501,8 +466,6 @@
 
     def testConstructsUrl(self):
         """combine_url constructs the URL correctly."""
-        from lp.registry.scripts.productreleasefinder.walker import (
-            combine_url)
         self.assertEqual(combine_url("file:///base", "/subdir/", "file"),
                          "file:///subdir/file")
         self.assertEqual(combine_url("file:///base", "/subdir", "file"),


Follow ups