openlp-core team mailing list archive
-
openlp-core team
-
Mailing list archive
-
Message #20490
[Merge] lp:~googol/openlp/bs4 into lp:openlp
Andreas Preikschat has proposed merging lp:~googol/openlp/bs4 into lp:openlp.
Requested reviews:
OpenLP Core (openlp-core)
For more details, see:
https://code.launchpad.net/~googol/openlp/bs4/+merge/159468
Hello,
- ported code to beautifulsoup 4
- moved regular expressions to the top
- fixed regression
Function tests worked fine.
I could not test the interface tests because bs4 is not installed on the server and the interface tests fails here (SegFault).
--
https://code.launchpad.net/~googol/openlp/bs4/+merge/159468
Your team OpenLP Core is requested to review the proposed merge of lp:~googol/openlp/bs4 into lp:openlp.
=== modified file 'openlp/core/ui/exceptionform.py'
--- openlp/core/ui/exceptionform.py 2013-04-02 20:52:31 +0000
+++ openlp/core/ui/exceptionform.py 2013-04-17 18:37:26 +0000
@@ -35,7 +35,7 @@
import platform
import sqlalchemy
-import BeautifulSoup
+from bs4 import BeautifulSoup
from lxml import etree
from PyQt4 import Qt, QtCore, QtGui, QtWebKit
=== modified file 'openlp/plugins/bibles/lib/http.py'
--- openlp/plugins/bibles/lib/http.py 2013-03-07 08:05:43 +0000
+++ openlp/plugins/bibles/lib/http.py 2013-04-17 18:37:26 +0000
@@ -36,7 +36,7 @@
import urllib
from HTMLParser import HTMLParseError
-from BeautifulSoup import BeautifulSoup, NavigableString, Tag
+from bs4 import BeautifulSoup, NavigableString, Tag
from openlp.core.lib import Registry, translate
from openlp.core.lib.ui import critical_error_message_box
@@ -44,6 +44,9 @@
from openlp.plugins.bibles.lib import SearchResults
from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
+CLEANER_REGEX = re.compile(r' |<br />|\'\+\'')
+FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
+REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
UGLY_CHARS = {
u'\u2014': u' - ',
u'\u2018': u'\'',
@@ -52,9 +55,12 @@
u'\u201d': u'"',
u' ': u' '
}
+VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
+
log = logging.getLogger(__name__)
+
class BGExtract(object):
"""
Extract verses from BibleGateway
@@ -78,9 +84,9 @@
An HTML class attribute for further qualification.
"""
if class_:
- all_tags = parent.findAll(tag, class_)
+ all_tags = parent.find_all(tag, class_)
else:
- all_tags = parent.findAll(tag)
+ all_tags = parent.find_all(tag)
for element in all_tags:
element.extract()
@@ -93,14 +99,15 @@
"""
if isinstance(tag, NavigableString):
return None, unicode(tag)
- elif tag.get('class') == 'versenum' or tag.get('class') == 'versenum mid-line':
+ elif tag.get('class')[0] == "versenum" or tag.get('class')[0] == 'versenum mid-line':
verse = unicode(tag.string).replace('[', '').replace(']', '').strip()
return verse, None
- elif tag.get('class') == 'chapternum':
+ elif tag.get('class')[0] == 'chapternum':
verse = '1'
return verse, None
else:
- verse, text = None, ''
+ verse = None
+ text = ''
for child in tag.contents:
c_verse, c_text = self._extract_verse(child)
if c_verse:
@@ -137,7 +144,8 @@
tags = tags[::-1]
current_text = ''
for tag in tags:
- verse, text = None, ''
+ verse = None
+ text = ''
for child in tag.contents:
c_verse, c_text = self._extract_verse(child)
if c_verse:
@@ -173,8 +181,8 @@
def _extract_verses_old(self, div):
"""
- Use the old style of parsing for those Bibles on BG who mysteriously
- have not been migrated to the new (still broken) HTML.
+ Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
+ broken) HTML.
``div``
The parent div.
@@ -185,13 +193,12 @@
if first_verse and first_verse.contents:
verse_list[1] = unicode(first_verse.contents[0])
for verse in div(u'sup', u'versenum'):
- raw_verse_num = verse.next
+ raw_verse_num = verse.next_element
clean_verse_num = 0
- # Not all verses exist in all translations and may or may not be
- # represented by a verse number. If they are not fine, if they are
- # it will probably be in a format that breaks int(). We will then
- # have no idea what garbage may be sucked in to the verse text so
- # if we do not get a clean int() then ignore the verse completely.
+ # Not all verses exist in all translations and may or may not be represented by a verse number. If they are
+ # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
+ # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
+ # completely.
try:
clean_verse_num = int(str(raw_verse_num))
except ValueError:
@@ -201,16 +208,16 @@
except TypeError:
log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
if clean_verse_num:
- verse_text = raw_verse_num.next
- part = raw_verse_num.next.next
- while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'):
+ verse_text = raw_verse_num.next_element
+ part = raw_verse_num.next_element.next_element
+ while not (isinstance(part, Tag) and part.get(u'class')[0] == u'versenum'):
# While we are still in the same verse grab all the text.
if isinstance(part, NavigableString):
verse_text += part
- if isinstance(part.next, Tag) and part.next.name == u'div':
+ if isinstance(part.next_element, Tag) and part.next_element.name == u'div':
# Run out of verses so stop.
break
- part = part.next
+ part = part.next_element
verse_list[clean_verse_num] = unicode(verse_text)
return verse_list
@@ -230,15 +237,14 @@
log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter)
url_book_name = urllib.quote(book_name.encode("utf-8"))
url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version)
- cleaner = [(re.compile(' |<br />|\'\+\''), lambda match: '')]
soup = get_soup_for_bible_ref(
u'http://www.biblegateway.com/passage/?%s' % url_params,
- pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='', cleaner=cleaner)
+ pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
if not soup:
return None
div = soup.find('div', 'result-text-style-normal')
self._clean_soup(div)
- span_list = div.findAll('span', 'text')
+ span_list = div.find_all('span', 'text')
log.debug('Span list: %s', span_list)
if not span_list:
# If we don't get any spans then we must have the old HTML format
@@ -282,7 +288,7 @@
self.application.process_events()
content = soup.find(u'table', u'infotable')
if content:
- content = content.findAll(u'tr')
+ content = content.find_all(u'tr')
if not content:
log.error(u'No books found in the Biblegateway response.')
send_error_message(u'parse')
@@ -341,19 +347,17 @@
log.error(u'No verses found in the Bibleserver response.')
send_error_message(u'parse')
return None
- content = content.find(u'div').findAll(u'div')
- verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
+ content = content.find(u'div').find_all(u'div')
verses = {}
for verse in content:
self.application.process_events()
- versenumber = int(verse_number.sub(r'\3', verse[u'class']))
+ versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', u' '.join(verse[u'class'])))
verses[versenumber] = verse.contents[1].rstrip(u'\n')
return SearchResults(book_name, chapter, verses)
def get_books_from_http(self, version):
"""
- Load a list of all books a Bible contains from Bibleserver mobile
- website.
+ Load a list of all books a Bible contains from Bibleserver mobile website.
``version``
The version of the Bible like NIV for New International Version
@@ -369,9 +373,19 @@
log.error(u'No books found in the Bibleserver response.')
send_error_message(u'parse')
return None
- content = content.findAll(u'li')
+ content = content.find_all(u'li')
return [book.contents[0].contents[0] for book in content]
+ def _get_application(self):
+ """
+ Adds the openlp to the class dynamically
+ """
+ if not hasattr(self, u'_application'):
+ self._application = Registry().get(u'application')
+ return self._application
+
+ application = property(_get_application)
+
class CWExtract(object):
"""
@@ -404,14 +418,12 @@
if not soup:
return None
self.application.process_events()
- html_verses = soup.findAll(u'span', u'versetext')
+ html_verses = soup.find_all(u'span', u'versetext')
if not html_verses:
log.error(u'No verses found in the CrossWalk response.')
send_error_message(u'parse')
return None
verses = {}
- reduce_spaces = re.compile(r'[ ]{2,}')
- fix_punctuation = re.compile(r'[ ]+([.,;])')
for verse in html_verses:
self.application.process_events()
verse_number = int(verse.contents[0].contents[0])
@@ -432,11 +444,10 @@
if isinstance(subsub, NavigableString):
verse_text += subsub
self.application.process_events()
- # Fix up leading and trailing spaces, multiple spaces, and spaces
- # between text and , and .
+ # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
verse_text = verse_text.strip(u'\n\r\t ')
- verse_text = reduce_spaces.sub(u' ', verse_text)
- verse_text = fix_punctuation.sub(r'\1', verse_text)
+ verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text)
+ verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
verses[verse_number] = verse_text
return SearchResults(book_name, chapter, verses)
@@ -458,7 +469,7 @@
log.error(u'No books found in the Crosswalk response.')
send_error_message(u'parse')
return None
- content = content.findAll(u'li')
+ content = content.find_all(u'li')
books = []
for book in content:
book = book.find(u'a')
@@ -481,9 +492,8 @@
def __init__(self, parent, **kwargs):
"""
- Finds all the bibles defined for the system
- Creates an Interface Object for each bible containing connection
- information
+ Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection
+ information.
Throws Exception if no Bibles are found.
@@ -492,8 +502,7 @@
BibleDB.__init__(self, parent, **kwargs)
self.download_source = kwargs[u'download_source']
self.download_name = kwargs[u'download_name']
- # TODO: Clean up proxy stuff. We probably want one global proxy per
- # connection type (HTTP and HTTPS) at most.
+ # TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most.
self.proxy_server = None
self.proxy_username = None
self.proxy_password = None
@@ -508,8 +517,8 @@
def do_import(self, bible_name=None):
"""
- Run the import. This method overrides the parent class method. Returns
- ``True`` on success, ``False`` on failure.
+ Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on
+ failure.
"""
self.wizard.progress_bar.setMaximum(68)
self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...'))
@@ -549,8 +558,7 @@
if self.stop_import_flag:
break
self.wizard.increment_progress_bar(translate(
- 'BiblesPlugin.HTTPBible', 'Importing %s...',
- 'Importing <book name>...') % book)
+ 'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing <book name>...') % book)
book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id)
if not book_ref_id:
log.exception(u'Importing books from %s - download name: "%s" '\
@@ -567,22 +575,19 @@
def get_verses(self, reference_list, show_error=True):
"""
- A reimplementation of the ``BibleDB.get_verses`` method, this one is
- specifically for web Bibles. It first checks to see if the particular
- chapter exists in the DB, and if not it pulls it from the web. If the
- chapter DOES exist, it simply pulls the verses from the DB using the
- ancestor method.
+ A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first
+ checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter
+ DOES exist, it simply pulls the verses from the DB using the ancestor method.
``reference_list``
- This is the list of references the media manager item wants. It is
- a list of tuples, with the following format::
+ This is the list of references the media manager item wants. It is a list of tuples, with the following
+ format::
(book_reference_id, chapter, start_verse, end_verse)
- Therefore, when you are looking for multiple items, simply break
- them up into references like this, bundle them into a list. This
- function then runs through the list, and returns an amalgamated
- list of ``Verse`` objects. For example::
+ Therefore, when you are looking for multiple items, simply break them up into references like this, bundle
+ them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse``
+ objects. For example::
[(u'35', 1, 1, 1), (u'35', 2, 2, 3)]
"""
@@ -643,7 +648,7 @@
Return the number of chapters in a particular book.
``book``
- The book object to get the chapter count for.
+ The book object to get the chapter count for.
"""
log.debug(u'HTTPBible.get_chapter_count("%s")', book.name)
return BiblesResourcesDB.get_chapter_count(book.book_reference_id)
@@ -671,8 +676,7 @@
application = property(_get_application)
-def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
- pre_parse_substitute=None, cleaner=None):
+def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, pre_parse_substitute=None):
"""
Gets a webpage and returns a parsed and optionally cleaned soup or None.
@@ -683,14 +687,11 @@
An optional HTTP header to pass to the bible web server.
``pre_parse_regex``
- A regular expression to run on the webpage. Allows manipulation of the
- webpage before passing to BeautifulSoup for parsing.
+ A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup
+ for parsing.
``pre_parse_substitute``
The text to replace any matches to the regular expression with.
-
- ``cleaner``
- An optional regex to use during webpage parsing.
"""
if not reference_url:
return None
@@ -703,10 +704,8 @@
page_source = re.sub(pre_parse_regex, pre_parse_substitute, page_source)
soup = None
try:
- if cleaner:
- soup = BeautifulSoup(page_source, markupMassage=cleaner)
- else:
- soup = BeautifulSoup(page_source)
+ soup = BeautifulSoup(page_source)
+ CLEANER_REGEX.sub(u'', unicode(soup))
except HTMLParseError:
log.exception(u'BeautifulSoup could not parse the bible page.')
if not soup:
=== modified file 'openlp/plugins/custom/forms/editcustomform.py'
--- openlp/plugins/custom/forms/editcustomform.py 2013-03-28 21:49:11 +0000
+++ openlp/plugins/custom/forms/editcustomform.py 2013-04-17 18:37:26 +0000
@@ -257,6 +257,6 @@
# We must have at least one slide.
if self.slide_list_view.count() == 0:
critical_error_message_box(message=translate('CustomPlugin.EditCustomForm',
- 'You need to add at least one slide'))
+ 'You need to add at least one slide.'))
return False
return True
=== modified file 'scripts/check_dependencies.py'
--- scripts/check_dependencies.py 2013-03-30 21:54:42 +0000
+++ scripts/check_dependencies.py 2013-04-17 18:37:26 +0000
@@ -79,7 +79,7 @@
'lxml',
'chardet',
'enchant',
- 'BeautifulSoup',
+ 'bs4',
'mako',
'migrate',
'uno',
=== modified file 'tests/interfaces/openlp_plugins/custom/forms/test_customform.py'
--- tests/interfaces/openlp_plugins/custom/forms/test_customform.py 2013-03-30 09:01:21 +0000
+++ tests/interfaces/openlp_plugins/custom/forms/test_customform.py 2013-04-17 18:37:26 +0000
@@ -62,3 +62,45 @@
QtTest.QTest.mouseClick(self.form.add_button, QtCore.Qt.LeftButton)
#THEN: One slide should be added.
assert self.form.slide_list_view.count() == 1, u'There should be one slide added.'
+
+ def validate_not_valid_part1_test(self):
+ """
+ Test the _validate() method.
+ """
+ # GIVEN: Mocked methods.
+ with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \
+ mocked_critical_error_message_box:
+ mocked_displayText = MagicMock()
+ mocked_displayText.return_value = u''
+ self.form.title_edit.displayText = mocked_displayText
+ mocked_setFocus = MagicMock()
+ self.form.title_edit.setFocus = mocked_setFocus
+
+ # WHEN: Call the method.
+ result = self.form._validate()
+
+ # THEN: The validate method should have returned False.
+ assert not result, u'The _validate() method should have retured False'
+ mocked_setFocus.assert_called_with()
+ mocked_critical_error_message_box.assert_called_with(message=u'You need to type in a title.')
+
+ def validate_not_valid_part2_test(self):
+ """
+ Test the _validate() method.
+ """
+ # GIVEN: Mocked methods.
+ with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \
+ mocked_critical_error_message_box:
+ mocked_displayText = MagicMock()
+ mocked_displayText.return_value = u'something'
+ self.form.title_edit.displayText = mocked_displayText
+ mocked_count = MagicMock()
+ mocked_count.return_value = 0
+ self.form.slide_list_view.count = mocked_count
+
+ # WHEN: Call the method.
+ result = self.form._validate()
+
+ # THEN: The validate method should have returned False.
+ assert not result, u'The _validate() method should have retured False'
+ mocked_critical_error_message_box.assert_called_with(message=u'You need to add at least one slide.')
Follow ups