launchpad-reviewers team mailing list archive

Thread
Date

[Merge] lp:~cjwatson/launchpad/bs4-initial into lp:launchpad

To: mp+366479@xxxxxxxxxxxxxxxxxx
From: Colin Watson <cjwatson@xxxxxxxxxxxxx>
Date: Wed, 24 Apr 2019 16:32:40 -0000
Reply-to: mp+366479@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Colin Watson has proposed merging lp:~cjwatson/launchpad/bs4-initial into lp:launchpad.

Commit message:
Add beautifulsoup4 and convert some initial tests to it.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/bs4-initial/+merge/366479

There's some slightly ugly duplication here, but it'll be fixed once we finish the conversion.

I've been meaning to get started on this for a while, but moved it up my list when I found that we're currently pulling in, via soupmatchers, an older version of beautifulsoup4 that is incompatible with bionic's html5lib (https://bugs.launchpad.net/beautifulsoup/+bug/1603299).
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/launchpad/bs4-initial into lp:launchpad.

=== modified file 'constraints.txt'
--- constraints.txt	2019-04-23 14:02:52 +0000
+++ constraints.txt	2019-04-24 16:24:33 +0000
@@ -227,9 +227,11 @@
 auditorfixture==0.0.7
 Automat==0.6.0
 Babel==2.5.1
+backports.functools-lru-cache==1.5
 backports.lzma==0.0.3
 bcrypt==3.1.4
 BeautifulSoup==3.2.1
+beautifulsoup4[lxml]==4.7.1
 billiard==3.5.0.5
 bson==0.3.3
 bzr==2.6.0.lp.3
@@ -352,6 +354,7 @@
 six==1.12.0
 snowballstemmer==1.2.1
 soupmatchers==0.4
+soupsieve==1.9
 sphinxcontrib-websupport==1.0.1
 # lp:~launchpad-committers/storm/lp
 storm==0.20.0.99-lp-r411

=== modified file 'lib/lp/answers/tests/test_question_webservice.py'
--- lib/lp/answers/tests/test_question_webservice.py	2017-11-10 11:28:43 +0000
+++ lib/lp/answers/tests/test_question_webservice.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2011-2016 Canonical Ltd.  This software is licensed under the
+# Copyright 2011-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Webservice unit tests related to Launchpad Questions."""
@@ -29,7 +29,7 @@
     NotQuestionOwnerError,
     QuestionTargetError,
     )
-from lp.services.beautifulsoup import BeautifulSoup
+from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
 from lp.services.webapp.interfaces import OAuthPermission
 from lp.testing import (
     admin_logged_in,
@@ -109,7 +109,7 @@
     def findQuestionTitle(self, response):
         """Find the question title field in an XHTML document fragment."""
         soup = BeautifulSoup(response.body)
-        dt = soup.find('dt', text="title").parent
+        dt = soup.find('dt', text="title")
         dd = dt.findNextSibling('dd')
         return str(dd.contents.pop())
 

=== modified file 'lib/lp/app/browser/tests/test_base_layout.py'
--- lib/lp/app/browser/tests/test_base_layout.py	2017-10-21 18:14:14 +0000
+++ lib/lp/app/browser/tests/test_base_layout.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2010 Canonical Ltd.  This software is licensed under the
+# Copyright 2010-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Tests for base-layout.pt and its macros.
@@ -16,7 +16,7 @@
 from z3c.ptcompat import ViewPageTemplateFile
 
 from lp.registry.interfaces.person import PersonVisibility
-from lp.services.beautifulsoup import BeautifulSoup
+from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
 from lp.services.webapp.publisher import LaunchpadView
 from lp.services.webapp.servers import LaunchpadTestRequest
 from lp.testing import (
@@ -85,7 +85,7 @@
         self.assertTrue(head.title.string.startswith(view.page_title))
         # The shortcut icon for the browser chrome is provided.
         link_tag = head.link
-        self.assertEqual('shortcut icon', link_tag['rel'])
+        self.assertEqual(['shortcut', 'icon'], link_tag['rel'])
         self.assertEqual('/@@/launchpad.png', link_tag['href'])
         # The template loads the common scripts.
         load_script = find_tag_by_id(head, 'base-layout-load-scripts').name
@@ -97,17 +97,17 @@
         yui_layout = document.find('div', 'yui-d0')
         self.assertTrue(yui_layout is not None)
         self.assertEqual(
-            'login-logout', yui_layout.find(True, id='locationbar')['class'])
-        self.assertEqual(
-            'yui-main', yui_layout.find(True, id='maincontent')['class'])
-        self.assertEqual(
-            'footer', yui_layout.find(True, id='footer')['class'])
+            ['login-logout'], yui_layout.find(True, id='locationbar')['class'])
+        self.assertEqual(
+            ['yui-main'], yui_layout.find(True, id='maincontent')['class'])
+        self.assertEqual(
+            ['footer'], yui_layout.find(True, id='footer')['class'])
 
     def verify_watermark(self, document):
         # Verify the parts of a watermark.
         yui_layout = document.find('div', 'yui-d0')
         watermark = yui_layout.find(True, id='watermark')
-        self.assertEqual('watermark-apps-portlet', watermark['class'])
+        self.assertEqual(['watermark-apps-portlet'], watermark['class'])
         if self.context.is_team:
             self.assertEqual('/@@/team-logo', watermark.img['src'])
             self.assertEqual(
@@ -115,7 +115,7 @@
         else:
             self.assertEqual('/@@/person-logo', watermark.img['src'])
             self.assertEqual('Waffles', watermark.h2.a.string)
-        self.assertEqual('facetmenu', watermark.ul['class'])
+        self.assertEqual(['facetmenu'], watermark.ul['class'])
 
     def test_main_side(self):
         # The main_side layout has everything.
@@ -127,10 +127,10 @@
         document = find_tag_by_id(content, 'document')
         self.verify_base_layout_body_parts(document)
         classes = 'tab-overview main_side public yui3-skin-sam'.split()
-        self.assertEqual(classes, document['class'].split())
+        self.assertEqual(classes, document['class'])
         self.verify_watermark(document)
         self.assertEqual(
-            'registering', document.find(True, id='registration')['class'])
+            ['registering'], document.find(True, id='registration')['class'])
         self.assertEqual(
             'Registered on 2005-09-16 by Illuminati',
             document.find(True, id='registration').string.strip(),
@@ -139,7 +139,8 @@
             extract_text(document.find(True, id='maincontent')),
             'Main content of the page.')
         self.assertEqual(
-            'yui-b side', document.find(True, id='side-portlets')['class'])
+            ['yui-b', 'side'],
+            document.find(True, id='side-portlets')['class'])
         self.assertEqual('form', document.find(True, id='globalsearch').name)
 
     def test_main_only(self):
@@ -151,10 +152,10 @@
         document = find_tag_by_id(content, 'document')
         self.verify_base_layout_body_parts(document)
         classes = 'tab-overview main_only public yui3-skin-sam'.split()
-        self.assertEqual(classes, document['class'].split())
+        self.assertEqual(classes, document['class'])
         self.verify_watermark(document)
         self.assertEqual(
-            'registering', document.find(True, id='registration')['class'])
+            ['registering'], document.find(True, id='registration')['class'])
         self.assertEqual(None, document.find(True, id='side-portlets'))
         self.assertEqual('form', document.find(True, id='globalsearch').name)
 
@@ -168,9 +169,9 @@
         self.verify_base_layout_body_parts(document)
         self.verify_watermark(document)
         classes = 'tab-overview searchless public yui3-skin-sam'.split()
-        self.assertEqual(classes, document['class'].split())
+        self.assertEqual(classes, document['class'])
         self.assertEqual(
-            'registering', document.find(True, id='registration')['class'])
+            ['registering'], document.find(True, id='registration')['class'])
         self.assertEqual(None, document.find(True, id='side-portlets'))
         self.assertEqual(None, document.find(True, id='globalsearch'))
 
@@ -180,7 +181,7 @@
         view._user = self.user
         content = BeautifulSoup(view())
         footer = find_tag_by_id(content, 'footer')
-        link = footer.find('a', text='Contact Launchpad Support').parent
+        link = footer.find('a', text='Contact Launchpad Support')
         self.assertEqual('/support', link['href'])
 
     def test_contact_support_anonymous(self):
@@ -189,7 +190,7 @@
         view._user = None
         content = BeautifulSoup(view())
         footer = find_tag_by_id(content, 'footer')
-        link = footer.find('a', text='Contact Launchpad Support').parent
+        link = footer.find('a', text='Contact Launchpad Support')
         self.assertEqual('/feedback', link['href'])
 
     def test_user_without_launchpad_view(self):

=== modified file 'lib/lp/app/browser/tests/test_launchpadroot.py'
--- lib/lp/app/browser/tests/test_launchpadroot.py	2018-01-02 16:10:26 +0000
+++ lib/lp/app/browser/tests/test_launchpadroot.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2010-2017 Canonical Ltd.  This software is licensed under the
+# Copyright 2010-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Tests related to ILaunchpadRoot."""
@@ -14,8 +14,8 @@
 from lp.registry.interfaces.person import IPersonSet
 from lp.registry.interfaces.pillar import IPillarNameSet
 from lp.services.beautifulsoup import (
-    BeautifulSoup,
-    SoupStrainer,
+    BeautifulSoup4 as BeautifulSoup,
+    SoupStrainer4 as SoupStrainer,
     )
 from lp.services.config import config
 from lp.services.features.testing import FeatureFixture
@@ -93,7 +93,7 @@
         # Stub out the getRecentBlogPosts which fetches a blog feed using
         # urlfetch.
         view.getRecentBlogPosts = lambda: []
-        content = BeautifulSoup(view(), parseOnlyThese=SoupStrainer('a'))
+        content = BeautifulSoup(view(), parse_only=SoupStrainer('a'))
         self.assertTrue(
             content.find('a', href='+featuredprojects'),
             "Cannot find the +featuredprojects link on the first page")
@@ -142,8 +142,7 @@
         view = create_initialized_view(root, 'index.html', principal=user)
         # Replace the blog posts so the view does not make a network request.
         view.getRecentBlogPosts = lambda: []
-        markup = BeautifulSoup(
-            view(), parseOnlyThese=SoupStrainer(id='document'))
+        markup = BeautifulSoup(view(), parse_only=SoupStrainer(id='document'))
         self.assertIs(False, view.has_watermark)
         self.assertIs(None, markup.find(True, id='watermark'))
         logo = markup.find(True, id='launchpad-logo-and-name')
@@ -177,8 +176,8 @@
             view = create_initialized_view(root, 'index.html')
             view.getRecentBlogPosts = _get_blog_posts
             result = view()
-        markup = BeautifulSoup(result,
-            parseOnlyThese=SoupStrainer(id='homepage-blogposts'))
+        markup = BeautifulSoup(
+            result, parse_only=SoupStrainer(id='homepage-blogposts'))
         self.assertEqual(['called'], calls)
         items = markup.findAll('li', 'news')
         # Notice about launchpad being opened is always added at the end
@@ -204,7 +203,7 @@
         view = create_initialized_view(root, 'index.html', principal=user)
         view.getRecentBlogPosts = _get_blog_posts
         markup = BeautifulSoup(
-            view(), parseOnlyThese=SoupStrainer(id='homepage'))
+            view(), parse_only=SoupStrainer(id='homepage'))
         self.assertEqual([], calls)
         self.assertIs(None, markup.find(True, id='homepage-blogposts'))
         # Even logged in users should get the launchpad intro text in the left
@@ -225,8 +224,8 @@
         with anonymous_logged_in():
             view = create_initialized_view(root, 'index.html')
             result = view()
-        markup = BeautifulSoup(result,
-            parseOnlyThese=SoupStrainer(id='homepage-blogposts'))
+        markup = BeautifulSoup(
+            result, parse_only=SoupStrainer(id='homepage-blogposts'))
         items = markup.findAll('li', 'news')
         self.assertEqual(3, len(items))
 

=== modified file 'lib/lp/app/widgets/doc/launchpad-radio-widget.txt'
--- lib/lp/app/widgets/doc/launchpad-radio-widget.txt	2017-10-21 18:14:14 +0000
+++ lib/lp/app/widgets/doc/launchpad-radio-widget.txt	2019-04-24 16:24:33 +0000
@@ -10,7 +10,7 @@
 The LaunchpadRadioWidget is mostly used to display items from
 an enumerated type.
 
-    >>> from lp.services.beautifulsoup import BeautifulSoup
+    >>> from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
     >>> from lp.services.webapp.servers import LaunchpadTestRequest
     >>> from lp.code.interfaces.branch import IBranch
     >>> branch = factory.makeAnyBranch()
@@ -25,15 +25,15 @@
 
     >>> html = BeautifulSoup(radio_widget())
     >>> for label in html.findAll('label'):
-    ...     print label.renderContents()
-    <input class="radioType" checked="checked" id="field.branch_type.0"
-           name="field.branch_type" type="radio" value="HOSTED" />&nbsp;Hosted
+    ...     print label.encode_contents(formatter='html')
+    <input checked="checked" class="radioType" id="field.branch_type.0"
+           name="field.branch_type" type="radio" value="HOSTED"/>&nbsp;Hosted
     <input class="radioType" id="field.branch_type.1" name="field.branch_type"
-           type="radio" value="MIRRORED" />&nbsp;Mirrored
+           type="radio" value="MIRRORED"/>&nbsp;Mirrored
     <input class="radioType" id="field.branch_type.2" name="field.branch_type"
-           type="radio" value="IMPORTED" />&nbsp;Imported
+           type="radio" value="IMPORTED"/>&nbsp;Imported
     <input class="radioType" id="field.branch_type.3" name="field.branch_type"
-           type="radio" value="REMOTE" />&nbsp;Remote
+           type="radio" value="REMOTE"/>&nbsp;Remote
 
 
 LaunchpadRadioWidgetWithDescription

=== modified file 'lib/lp/blueprints/stories/sprints/xx-sprint-meeting-export.txt'
--- lib/lp/blueprints/stories/sprints/xx-sprint-meeting-export.txt	2013-04-16 01:18:10 +0000
+++ lib/lp/blueprints/stories/sprints/xx-sprint-meeting-export.txt	2019-04-24 16:24:33 +0000
@@ -30,9 +30,9 @@
 
 The attendees element contains a list of person elements.
 
-    >>> from BeautifulSoup import BeautifulStoneSoup as BSS
     >>> import operator
-    >>> soup = BSS(browser.contents)
+    >>> from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
+    >>> soup = BeautifulSoup(browser.contents, 'xml')
     >>> people = soup.find('attendees').findAll('person')
     >>> for person in sorted(people, key=operator.itemgetter("displayname")):
     ...     print "%(displayname)s, %(name)s, %(start)s -> %(end)s" % person
@@ -42,7 +42,7 @@
 The <unscheduled /> element contains a list of meetings. Each of these
 actually refers to a Specification.
 
-    >>> soup = BSS(browser.contents)
+    >>> soup = BeautifulSoup(browser.contents, 'xml')
     >>> meetings = soup.find('unscheduled').findAll('meeting')
     >>> for meeting in meetings:
     ...     print "%(id)s: %(name)s, %(lpurl)s" % meeting

=== modified file 'lib/lp/bugs/browser/tests/test_bug_views.py'
--- lib/lp/bugs/browser/tests/test_bug_views.py	2018-01-02 16:10:26 +0000
+++ lib/lp/bugs/browser/tests/test_bug_views.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2011-2012 Canonical Ltd.  This software is licensed under the
+# Copyright 2011-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Tests for Bug Views."""
@@ -37,7 +37,7 @@
     IAccessPolicySource,
     )
 from lp.registry.interfaces.person import PersonVisibility
-from lp.services.beautifulsoup import BeautifulSoup
+from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
 from lp.services.webapp.interfaces import IOpenLaunchBag
 from lp.services.webapp.publisher import canonical_url
 from lp.services.webapp.servers import LaunchpadTestRequest
@@ -247,7 +247,7 @@
         # a CSS class `css_class`.
         soup = BeautifulSoup(html)
         element = soup.find(attrs={'id': element_id})
-        return css_class in element.get('class', '').split(' ')
+        return css_class in element.get('class', [])
 
     def test_bug_mute_for_individual_structural_subscription(self):
         # If the person has a structural subscription to the pillar,
@@ -540,7 +540,7 @@
             html = view.render()
             soup = BeautifulSoup(html)
         self.assertEqual(
-            u'Private', soup.find('label', text="Private"))
+            u'Private', soup.find('label', text="Private").string)
 
     def test_bugtask_view_user_with_grant_on_bug_for_private_product(self):
         # The regular bug view is properly rendered even if the user

=== modified file 'lib/lp/bugs/tests/test_bugs_webservice.py'
--- lib/lp/bugs/tests/test_bugs_webservice.py	2018-01-02 10:54:31 +0000
+++ lib/lp/bugs/tests/test_bugs_webservice.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2009-2012 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Webservice unit tests related to Launchpad Bugs."""
@@ -31,7 +31,7 @@
 from lp.bugs.interfaces.bug import IBug
 from lp.registry.enums import BugSharingPolicy
 from lp.registry.interfaces.product import License
-from lp.services.beautifulsoup import BeautifulSoup
+from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
 from lp.services.webapp import snapshot
 from lp.services.webapp.interfaces import OAuthPermission
 from lp.services.webapp.servers import LaunchpadTestRequest
@@ -108,7 +108,7 @@
     def findBugDescription(self, response):
         """Find the bug description field in an XHTML document fragment."""
         soup = BeautifulSoup(response.body)
-        dt = soup.find('dt', text="description").parent
+        dt = soup.find('dt', text="description")
         dd = dt.findNextSibling('dd')
         return str(dd.contents.pop())
 
@@ -121,7 +121,7 @@
         self.assertEqual(
             self.findBugDescription(response),
             u'<p>Useless bugs are useless. '
-            'See <a href="/bugs/%d" class="bug-link">Bug %d</a>.</p>' % (
+            'See <a class="bug-link" href="/bugs/%d">Bug %d</a>.</p>' % (
             self.bug_one.id, self.bug_one.id))
 
     def test_PATCH_xhtml_representation(self):
@@ -140,7 +140,7 @@
 
         self.assertEqual(
             self.findBugDescription(response),
-            u'<p>See <a href="/bugs/%d" class="bug-link">bug %d</a></p>' % (
+            u'<p>See <a class="bug-link" href="/bugs/%d">bug %d</a></p>' % (
             self.bug_one.id, self.bug_one.id))
 
 

=== modified file 'lib/lp/registry/stories/person/xx-person-rdf.txt'
--- lib/lp/registry/stories/person/xx-person-rdf.txt	2018-01-26 22:18:38 +0000
+++ lib/lp/registry/stories/person/xx-person-rdf.txt	2019-04-24 16:24:33 +0000
@@ -4,14 +4,14 @@
 We export FOAF RDF metadata from the /~Person.name/+index document.
 
     >>> from lp.services.beautifulsoup import (
-    ...     BeautifulSoup,
-    ...     SoupStrainer,
+    ...     BeautifulSoup4 as BeautifulSoup,
+    ...     SoupStrainer4 as SoupStrainer,
     ...     )
     >>> anon_browser.open("http://launchpad.dev/~name16")
     >>> strainer = SoupStrainer(['link'], {'type': ['application/rdf+xml']})
-    >>> soup = BeautifulSoup(anon_browser.contents, parseOnlyThese=strainer)
+    >>> soup = BeautifulSoup(anon_browser.contents, parse_only=strainer)
     >>> print soup.renderContents()
-    <link rel="meta" type="application/rdf+xml" title="FOAF" href="+rdf" />
+    <link href="+rdf" rel="meta" title="FOAF" type="application/rdf+xml"/>
 
 
 Individual RDF
@@ -102,7 +102,7 @@
 
     >>> anon_browser.open("http://launchpad.dev/~carlos/+rdf")
     >>> strainer = SoupStrainer(['foaf:name'])
-    >>> soup = BeautifulSoup(anon_browser.contents, parseOnlyThese=strainer)
+    >>> soup = BeautifulSoup(anon_browser.contents, parse_only=strainer)
     >>> for tag in soup:
     ...   tag.renderContents()
     'Carlos Perell\xc3\xb3 Mar\xc3\xadn'
@@ -112,7 +112,7 @@
 
     >>> anon_browser.open("http://launchpad.dev/~name21/+rdf")
     >>> strainer = SoupStrainer(['foaf:member'])
-    >>> soup = BeautifulSoup(anon_browser.contents, parseOnlyThese=strainer)
+    >>> soup = BeautifulSoup(anon_browser.contents, parse_only=strainer)
     >>> len(soup)
     0
 

=== modified file 'lib/lp/registry/stories/team/xx-team-contactemail.txt'
--- lib/lp/registry/stories/team/xx-team-contactemail.txt	2017-10-21 18:14:14 +0000
+++ lib/lp/registry/stories/team/xx-team-contactemail.txt	2019-04-24 16:24:33 +0000
@@ -24,9 +24,9 @@
 A warning is rendered about the privacy implications of using a mailing list or
 external contact address.
 
-    >>> from lp.services.beautifulsoup import BeautifulSoup
+    >>> from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
     >>> soup = BeautifulSoup(browser.contents)
-    >>> soup.find(id='email-warning')
+    >>> print(soup.find(id='email-warning').decode())
     <p ... Email sent to a mailing list or external contact address may ...
 
 As we can see, the landscape-developers team has no contact address.

=== modified file 'lib/lp/services/beautifulsoup.py'
--- lib/lp/services/beautifulsoup.py	2017-10-21 18:14:14 +0000
+++ lib/lp/services/beautifulsoup.py	2019-04-24 16:24:33 +0000
@@ -1,10 +1,10 @@
-# Copyright 2017 Canonical Ltd.  This software is licensed under the
+# Copyright 2017-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Beautiful Soup wrapper for Launchpad.
 
 With Beautiful Soup 3, this is mostly for future migration convenience.
-With Beautiful Soup 4, it will do a little more work to avoid warnings.
+With Beautiful Soup 4, it does a little more work to avoid warnings.
 """
 
 from __future__ import absolute_import, print_function, unicode_literals
@@ -12,7 +12,9 @@
 __metaclass__ = type
 __all__ = [
     'BeautifulSoup',
+    'BeautifulSoup4',
     'SoupStrainer',
+    'SoupStrainer4',
     ]
 
 
@@ -20,6 +22,8 @@
     BeautifulSoup as _BeautifulSoup,
     SoupStrainer,
     )
+from bs4 import BeautifulSoup as _BeautifulSoup4
+from bs4.element import SoupStrainer as SoupStrainer4
 
 
 class BeautifulSoup(_BeautifulSoup):
@@ -28,3 +32,12 @@
         if not isinstance(markup, unicode) and "fromEncoding" not in kwargs:
             kwargs["fromEncoding"] = "UTF-8"
         super(BeautifulSoup, self).__init__(markup=markup, **kwargs)
+
+
+class BeautifulSoup4(_BeautifulSoup4):
+
+    def __init__(self, markup="", features="html.parser", **kwargs):
+        if not isinstance(markup, unicode) and "from_encoding" not in kwargs:
+            kwargs["from_encoding"] = "UTF-8"
+        super(BeautifulSoup4, self).__init__(
+            markup=markup, features=features, **kwargs)

=== modified file 'lib/lp/services/oauth/doc/oauth-pages.txt'
--- lib/lp/services/oauth/doc/oauth-pages.txt	2017-10-21 18:14:14 +0000
+++ lib/lp/services/oauth/doc/oauth-pages.txt	2019-04-24 16:24:33 +0000
@@ -26,15 +26,15 @@
     ...     return view, token
 
     >>> from lp.services.beautifulsoup import (
-    ...     BeautifulSoup,
-    ...     SoupStrainer,
+    ...     BeautifulSoup4 as BeautifulSoup,
+    ...     SoupStrainer4 as SoupStrainer,
     ...     )
     >>> def print_hidden_fields(html):
     ...     soup = BeautifulSoup(
-    ...         html, parseOnlyThese=SoupStrainer(attrs={'type': 'hidden'}))
+    ...         html, parse_only=SoupStrainer(attrs={'type': 'hidden'}))
     ...     for tag in soup.findAll(attrs={'type': 'hidden'}):
-    ...         if tag.attrMap['value']:
-    ...             print tag.attrMap['name'], tag.attrMap['value']
+    ...         if tag['value']:
+    ...             print tag['name'], tag['value']
 
 When the client doesn't specify a duration, the resulting request
 token will have no expiration date set.

=== modified file 'lib/lp/soyuz/browser/tests/test_archive_packages.py'
--- lib/lp/soyuz/browser/tests/test_archive_packages.py	2018-02-01 18:44:21 +0000
+++ lib/lp/soyuz/browser/tests/test_archive_packages.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2010-2018 Canonical Ltd.  This software is licensed under the
+# Copyright 2010-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Unit tests for TestP3APackages."""
@@ -25,7 +25,7 @@
 
 from lp.app.utilities.celebrities import ILaunchpadCelebrities
 from lp.registry.interfaces.pocket import PackagePublishingPocket
-from lp.services.beautifulsoup import BeautifulSoup
+from lp.services.beautifulsoup import BeautifulSoup4 as BeautifulSoup
 from lp.services.webapp import canonical_url
 from lp.services.webapp.authentication import LaunchpadPrincipal
 from lp.soyuz.browser.archive import ArchiveNavigationMenu
@@ -397,9 +397,10 @@
         self.assertEqual([],
             soup.findAll(
                 'div', attrs={'class': 'pending-job', 'job_id': jobs[-1].id}))
+        showing_tags = soup.find_all(
+            'span', text=re.compile('Showing 5 of .'))
         self.assertEqual(
-            ['Showing 5 of 7'],
-            soup.findAll('span', text=re.compile('Showing 5 of .')))
+            ['Showing 5 of 7'], [tag.string for tag in showing_tags])
 
     def test_job_notifications_display_owner_is_team(self):
         team = self.factory.makeTeam()

=== modified file 'lib/lp/testing/pages.py'
--- lib/lp/testing/pages.py	2018-12-10 13:54:34 +0000
+++ lib/lp/testing/pages.py	2019-04-24 16:24:33 +0000
@@ -1,4 +1,4 @@
-# Copyright 2009-2018 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2019 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Testing infrastructure for page tests."""
@@ -25,6 +25,15 @@
     ProcessingInstruction,
     Tag,
     )
+from bs4.element import (
+    Comment as Comment4,
+    Declaration as Declaration4,
+    Doctype as Doctype4,
+    NavigableString as NavigableString4,
+    PageElement as PageElement4,
+    ProcessingInstruction as ProcessingInstruction4,
+    Tag as Tag4,
+    )
 from contrib.oauth import (
     OAuthConsumer,
     OAuthRequest,
@@ -195,6 +204,8 @@
     """Find and return the tag with the given ID"""
     if isinstance(content, PageElement):
         elements_with_id = content.findAll(True, {'id': id})
+    elif isinstance(content, PageElement4):
+        elements_with_id = content.find_all(True, {'id': id})
     else:
         elements_with_id = [
             tag for tag in BeautifulSoup(
@@ -272,10 +283,10 @@
     return [extract_text(tag) for tag in soup]
 
 
-def print_feedback_messages(content):
+def print_feedback_messages(content, formatter='minimal'):
     """Print out the feedback messages."""
     for message in get_feedback_messages(content):
-        print extract_text(message)
+        print extract_text(message, formatter=formatter)
 
 
 def print_table(content, columns=None, skip_rows=None, sep="\t"):
@@ -337,7 +348,10 @@
     return label.replace('\xC2', '').replace('\xA0', '').strip()
 
 
-IGNORED_ELEMENTS = [Comment, Declaration, ProcessingInstruction]
+IGNORED_ELEMENTS = [
+    Comment, Declaration, ProcessingInstruction,
+    Comment4, Declaration4, Doctype4, ProcessingInstruction4,
+    ]
 ELEMENTS_INTRODUCING_NEWLINE = [
     'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'pre', 'dl',
     'div', 'noscript', 'blockquote', 'form', 'hr', 'table', 'fieldset',
@@ -348,7 +362,7 @@
 LEADING_AND_TRAILING_SPACES_RE = re.compile(
     u'(^[ \t]+)|([ \t]$)', re.MULTILINE)
 TABS_AND_SPACES_RE = re.compile(u'[ \t]+')
-NBSP_RE = re.compile(u'&nbsp;|&#160;')
+NBSP_RE = re.compile(u'&nbsp;|&#160;|\xa0')
 
 
 def extract_link_from_tag(tag, base=None):
@@ -357,7 +371,7 @@
     A `tag` should contain a 'href' attribute, and `base` will commonly
     be extracted from browser.url.
     """
-    if not isinstance(tag, PageElement):
+    if not isinstance(tag, (PageElement, PageElement4)):
         link = BeautifulSoup(tag)
     else:
         link = tag
@@ -369,7 +383,8 @@
         return urljoin(base, href)
 
 
-def extract_text(content, extract_image_text=False, skip_tags=None):
+def extract_text(content, extract_image_text=False, skip_tags=None,
+                 formatter='minimal'):
     """Return the text stripped of all tags.
 
     All runs of tabs and spaces are replaced by a single space and runs of
@@ -378,7 +393,7 @@
     """
     if skip_tags is None:
         skip_tags = ['script']
-    if not isinstance(content, PageElement):
+    if not isinstance(content, (PageElement, PageElement4)):
         soup = BeautifulSoup(content)
     else:
         soup = content
@@ -409,10 +424,15 @@
             result.append(unicode(node[:]))
         elif isinstance(node, NavigableString):
             result.append(unicode(node))
+        elif isinstance(node, NavigableString4):
+            result.append(node.format_string(node, formatter=formatter))
         else:
-            if isinstance(node, Tag):
+            if isinstance(node, (Tag, Tag4)):
                 # If the node has the class "sortkey" then it is invisible.
-                if node.get('class') == 'sortkey':
+                if isinstance(node, Tag) and node.get('class') == 'sortkey':
+                    continue
+                elif (isinstance(node, Tag4) and
+                        node.get('class') == ['sortkey']):
                     continue
                 elif getattr(node, 'name', '') in skip_tags:
                     continue
@@ -622,8 +642,12 @@
     else:
         for tab in location_apps:
             tab_text = extract_text(tab)
-            if tab['class'].find('active') != -1:
-                tab_text += ' (selected)'
+            if isinstance(tab['class'], list):  # BeautifulSoup 4
+                if 'active' in tab['class']:
+                    tab_text += ' (selected)'
+            else:                               # BeautifulSoup 3
+                if tab['class'].find('active') != -1:
+                    tab_text += ' (selected)'
             if tab.a:
                 link = tab.a['href']
             else:

=== modified file 'setup.py'
--- setup.py	2019-04-16 14:30:40 +0000
+++ setup.py	2019-04-24 16:24:33 +0000
@@ -148,6 +148,7 @@
         'auditorfixture',
         'backports.lzma',
         'BeautifulSoup',
+        'beautifulsoup4[lxml]',
         'bzr',
         'celery',
         'cssselect',

=== modified file 'utilities/snakefood/Makefile'
--- utilities/snakefood/Makefile	2011-12-29 05:29:36 +0000
+++ utilities/snakefood/Makefile	2019-04-24 16:24:33 +0000
@@ -8,7 +8,7 @@
 	-I $(LIB_DIR)/devscripts -I $(LIB_DIR)/contrib \
 	-I $(LIB_DIR)/canonical/not-used $(LIB_DIR)/canonical \
 	$(LIB_DIR)/lp 2>/dev/null | grep -v contrib/ \
-	| grep -v sqlobject | grep -v BeautifulSoup | grep -v psycopg \
+	| grep -v sqlobject | egrep -v 'BeautifulSoup|bs4' | grep -v psycopg \
 	| grep -v schoolbell | grep -v '/tests/' | grep -v '/ftests/' \
     | grep -v 'lp/services/config' > lp.sfood.tmp
 	mv lp.sfood.tmp lp.sfood

Follow ups

[Merge] lp:~cjwatson/launchpad/bs4-initial into lp:launchpad
From: noreply, 2019-04-25
Re: [Merge] lp:~cjwatson/launchpad/bs4-initial into lp:launchpad
From: Maximiliano Bertacchini, 2019-04-25