[Merge] lp:~widelands-dev/widelands-website/update_beautifulsoup4 into lp:widelands-website
kaputtnik has proposed merging lp:~widelands-dev/widelands-website/update_beautifulsoup4 into lp:widelands-website.
Commit message:
Update BeautifulSoup and make needed changes
Requested reviews:
Widelands Developers (widelands-dev)
For more details, see:
https://code.launchpad.net/~widelands-dev/widelands-website/update_beautifulsoup4/+merge/358571
Update BeautifulSoup3 to BeautifulSoup4. This is a prerequisite for the Python update.
In contrast to bs3, bs4 escapes all plain (unicode) strings, so it is no longer possible to insert raw markup as a string, e.g. '<a href="http://example.com">LINKTEXT</a>'. Inserted into a bs4 object, this comes out escaped as '&lt;a href="http://example.com"&gt;LINKTEXT&lt;/a&gt;'.
This branch takes care of this and changes the affected code to work with BeautifulSoup4 objects. The new code could probably be smarter, but I find it understandable.
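To illustrate the difference, a minimal sketch (assuming bs4 with the lxml parser; the markup and URL are only examples):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>LINKTEXT</p>', 'lxml')

    # Raw markup inserted as a plain string gets escaped by bs4:
    soup.p.string.replace_with('<a href="http://example.com">LINKTEXT</a>')
    print(soup.p)
    # -> <p>&lt;a href="http://example.com"&gt;LINKTEXT&lt;/a&gt;</p>

    # Instead, a real Tag object has to be built and inserted:
    soup = BeautifulSoup('<p>LINKTEXT</p>', 'lxml')
    link = soup.new_tag('a', href='http://example.com')
    link.string = 'LINKTEXT'
    soup.p.string.replace_with(link)
    print(soup.p)
    # -> <p><a href="http://example.com">LINKTEXT</a></p>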
I have also renamed some variables and reworked comments.
The rendering times are on par with BeautifulSoup3, or slightly better. E.g.
For the Developers page: /developers/
bs3: ~0.62s
bs4: ~0.45s
For /wiki/WikiSyntax/
bs3: ~0.14s
bs4: ~0.14s
The Regular expression for finding pasted plain text-links is tested here: https://regexr.com/42pq5
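For illustration, a small sketch of how the regex and the string filter work together (the sample sentence is made up; exclude_code_tag is the callable from this branch): because the pattern is wrapped in a capturing group, re.split() keeps the matched links in the result list, so plain text and links can be processed separately:

    import re
    from bs4 import BeautifulSoup

    PLAIN_LINK_RE = re.compile(r'(http[s]?:\/\/[-a-zA-Z0-9@:%._\+~#=/?]+)')

    def exclude_code_tag(bs4_string):
        # Callable filter for find_all(): pick only strings that contain
        # a plain link and are not inside a <code> tag
        if bs4_string.parent.name == 'code':
            return False
        return bool(PLAIN_LINK_RE.search(bs4_string))

    soup = BeautifulSoup('<p>see https://widelands.org for details</p>', 'lxml')
    for found in soup.find_all(string=exclude_code_tag):
        print(PLAIN_LINK_RE.split(found))
        # -> ['see ', 'https://widelands.org', ' for details']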
I have also removed the SMILEY_PREESCAPING code, because things now work without it. The only remaining problem: the 'devilish' smiley won't be rendered if it is placed as the very first characters of a post. I am in favor of replacing '>:-)' with ']:-)' to fix this. Any remarks on this?
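For reference, the reason it breaks at the very beginning is markdown's blockquote syntax: a '>' at the start of a line opens a quote. A rough sketch with Python-Markdown:

    from markdown import markdown

    print(markdown('>:-) some text'))
    # -> <blockquote>
    # -> <p>:-) some text</p>
    # -> </blockquote>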
--
Your team Widelands Developers is requested to review the proposed merge of lp:~widelands-dev/widelands-website/update_beautifulsoup4 into lp:widelands-website.
=== modified file 'mainpage/templatetags/wl_markdown.py'
--- mainpage/templatetags/wl_markdown.py 2017-11-14 16:54:28 +0000
+++ mainpage/templatetags/wl_markdown.py 2018-11-09 18:07:36 +0000
@@ -25,7 +25,7 @@
import urllib
import bleach
-from BeautifulSoup import BeautifulSoup, NavigableString
+from bs4 import BeautifulSoup, NavigableString
# If we can import a Wiki module with Articles, we
# will check for internal wikipages links in all internal
@@ -38,8 +38,7 @@
# We will also need the site domain
from django.contrib.sites.models import Site
-from settings import SITE_ID, SMILEYS, SMILEY_DIR, \
-    SMILEY_PREESCAPING
+from settings import SITE_ID, SMILEYS, SMILEY_DIR
try:
    _domain = Site.objects.get(pk=SITE_ID).domain
@@ -60,42 +59,55 @@
def _insert_smileys(text):
"""This searches for smiley symbols in the current text and replaces them
with the correct images.
-
-    Only replacing if smiley symbols aren't in a word (e.g. http://....)
-
-    """
-    words = text.split(' ')
-    for sc, img in SMILEYS:
-        if sc in words:
-            words[words.index(
-                sc)] = "<img src='%s%s' alt='%s' />" % (SMILEY_DIR, img, img)
-    text = ' '.join(words)
-    return text
-
-
-def _insert_smiley_preescaping(text):
-    """This searches for smiley symbols in the current text and replaces them
-    with the correct images."""
-    for before, after in SMILEY_PREESCAPING:
-        text = text.replace(before, after)
-    return text
+
+    Then we have to reassemble the whole contents..."""
+
+    tmp_content = []
+    for content in text.parent.contents:
+        try:
+            # If this fails, content is probably '\n' or not a string, e.g. <br />
+            words = content.split(' ')
+        except AttributeError:
+            # Append the unsplittable content and continue
+            tmp_content.append(content)
+            continue
+
+        for i, word in enumerate(words):
+            smiley = ""
+            for sc, img in SMILEYS:
+                if word == sc:
+                    smiley = img
+            if smiley:
+                img_tag = BeautifulSoup(features="lxml").new_tag('img')
+                img_tag['src'] = "{}{}".format(SMILEY_DIR, smiley)
+                img_tag['alt'] = smiley
+                tmp_content.append(img_tag)
+                # Append a space after the smiley
+                tmp_content.append(NavigableString(' '))
+            else:
+                if i < (len(words) - 1):
+                    # Append a space after each word, except the last word
+                    word = word + ' '
+                tmp_content.append(NavigableString(word))
+
+    text.parent.contents = [x for x in tmp_content]
def _classify_link(tag):
- """Returns a classname to insert if this link is in any way special
+ """Applies a classname if this link is in any way special
(external or missing wikipages)
- tag to classify for
+ tag: classify for this tag
"""
# No class change for image links
- if tag.findChild('img') != None:
- return None
+ if tag.next_element.name == 'img':
+ return
try:
href = tag['href'].lower()
except KeyError:
- return None
+ return
# Check for external link
if href.startswith('http'):
@@ -105,67 +117,93 @@
                external = False
                break
        if external:
-            return {'class': 'externalLink', 'title': 'This link refers to outer space'}
+            tag['class'] = "externalLink"
+            tag['title'] = "This link refers to outer space"
+            return
    if '/profile/' in (tag['href']):
-        return {'class': 'userLink', 'title': 'This link refers to a userpage'}
+        tag['class'] = "userLink"
+        tag['title'] = "This link refers to a userpage"
+        return
    if check_for_missing_wikipages and href.startswith('/wiki/'):
        # Check for missing wikilink /wiki/PageName[/additionl/stuff]
        # Using href because we need cAsEs here
-        pn = urllib.unquote(tag['href'][6:].split('/', 1)[0])
+        article_name = urllib.unquote(tag['href'][6:].split('/', 1)[0])
-        if not len(pn):  # Wiki root link is not a page
-            return {'class': 'wrongLink', 'title': 'This Link misses an articlename'}
+        if not len(article_name):  # Wiki root link is not a page
+            tag['class'] = "wrongLink"
+            tag['title'] = "This Link misses an articlename"
+            return
        # Wiki special pages are also not counted
-        if pn in ['list', 'search', 'history', 'feeds', 'observe', 'edit']:
-            return {'class': 'specialLink'}
+        if article_name in ['list', 'search', 'history', 'feeds', 'observe', 'edit']:
+            tag['class'] = "specialLink"
+            return
        # Check for a redirect
        try:
            # try to get the article id; if this fails an IndexError is raised
            a_id = ChangeSet.objects.filter(
-                old_title=pn).values_list('article_id')[0]
+                old_title=article_name).values_list('article_id')[0]
            # get actual title of article
            act_t = Article.objects.get(id=a_id[0]).title
-            if pn != act_t:
-                return {'title': "This is a redirect and points to \"" + act_t + "\""}
+            if article_name != act_t:
+                tag['title'] = "This is a redirect and points to \"" + act_t + "\""
+                return
            else:
-                return None
+                return
        except IndexError:
            pass
        # article missing (or misspelled)
-        if Article.objects.filter(title=pn).count() == 0:
-            return {'class': 'missingLink', 'title': 'This Link is misspelled or missing. Click to create it anyway.'}
-
-    return None
-
-
-def _clickable_image(tag):
+        if Article.objects.filter(title=article_name).count() == 0:
+            tag['class'] = "missingLink"
+            tag['title'] = "This Link is misspelled or missing. Click to create it anyway."
+            return
+    return
+
+
+def _make_clickable_images(tag):
    # is external link?
    if tag['src'].startswith('http'):
-        # is allways a link?
+        # Do not change it if it is already a link
        if tag.parent.name != 'a':
            # add link to image
-            text = '<a href=' + tag['src'] + \
-                '><img src=' + tag['src'] + '></a>'
-            return text
-    return None
-
+            new_link = BeautifulSoup(features="lxml").new_tag('a')
+            new_link['href'] = tag['src']
+            new_img = BeautifulSoup(features="lxml").new_tag('img')
+            new_img['src'] = tag['src']
+            new_img['alt'] = tag['alt']
+            new_link.append(new_img)
+            tag.replace_with(new_link)
+    return
+
+
+def find_smiley_Strings(bs4_string):
+    """Find strings that contain a smiley symbol."""
+
+    if bs4_string.parent.name.lower() == 'code':
+        return False
+
+    for sc in SMILEYS:
+        if sc[0] in bs4_string:
+            return True
+    return False
# Predefine the markdown extensions here to have a clean code in
# do_wl_markdown()
md_extensions = ['extra', 'toc', SemanticWikiLinkExtension()]
def do_wl_markdown(value, *args, **keyw):
-    # Do Preescaping for markdown, so that some things stay intact
-    # This is currently only needed for this smiley ">:-)"
-    value = _insert_smiley_preescaping(value)
-    custom = keyw.pop('custom', True)
+    """Apply wl specific things, like smileys or colored links.
+
+    If something gets modified, it is mostly done directly in the subfunctions."""
+
+    beautify = keyw.pop('beautify', True)
    html = smart_str(markdown(value, extensions=md_extensions))
    # Sanitize posts from potential untrusted users (Forum/Wiki/Maps)
@@ -173,49 +211,29 @@
    html = mark_safe(bleach.clean(
        html, tags=BLEACH_ALLOWED_TAGS, attributes=BLEACH_ALLOWED_ATTRIBUTES))
-    # Since we only want to do replacements outside of tags (in general) and not between
-    # <a> and </a> we partition our site accordingly
-    # BeautifoulSoup does all the heavy lifting
-    soup = BeautifulSoup(html)
+    # Prepare the html and apply smileys and classes.
+    # BeautifulSoup objects are all references, so changing a variable
+    # derived from the soup will take effect on the soup itself.
+    # Because of that the called functions will modify the soup directly.
+    soup = BeautifulSoup(html, features="lxml")
    if len(soup.contents) == 0:
        # well, empty soup. Return it
        return unicode(soup)
-    for text in soup.findAll(text=True):
-        # Do not replace inside a link
-        if text.parent.name == 'a':
-            continue
-
-        # We do our own small preprocessing of the stuff we got, after markdown
-        # went over it General consensus is to avoid replacing anything in
-        # links [blah](blkf)
-        if custom:
-            rv = text
-            # Replace smileys; only outside "code-tags"
-            if not text.parent.name == 'code':
-                rv = _insert_smileys(rv)
-
-            text.replaceWith(rv)
-
-    # This call slows the whole function down...
-    # unicode->reparsing.
-    # The function goes from .5 ms to 1.5ms on my system
-    # Well, for our site with it's little traffic it's maybe not so important...
-    # What a waste of cycles :(
-    soup = BeautifulSoup(unicode(soup))
-    # We have to go over this to classify links
-    for tag in soup.findAll('a'):
-        rv = _classify_link(tag)
-        if rv:
-            for attribute in rv.iterkeys():
-                tag[attribute] = rv.get(attribute)
-
-    # All external images gets clickable
-    # This applies only in forum
-    for tag in soup.findAll('img'):
-        link = _clickable_image(tag)
-        if link:
-            tag.replaceWith(link)
+    if beautify:
+        # Insert smileys
+        smiley_text = soup.find_all(string=find_smiley_Strings)
+        for text in smiley_text:
+            _insert_smileys(text)
+
+        # Classify links
+        for tag in soup.find_all('a'):
+            _classify_link(tag)
+
+        # All external images are made clickable
+        # This applies only in forum
+        for tag in soup.find_all('img'):
+            _make_clickable_images(tag)
    return unicode(soup)
=== modified file 'mainpage/views.py'
--- mainpage/views.py 2018-04-03 05:18:03 +0000
+++ mainpage/views.py 2018-11-09 18:07:36 +0000
@@ -126,7 +126,7 @@
    except IOError:
        txt = txt + "Couldn't find developer file!"
-    txt = do_wl_markdown(txt, custom=False)
+    txt = do_wl_markdown(txt, beautify=False)
    return render(request, 'mainpage/developers.html',
                  {'developers': txt}
=== modified file 'pip_requirements.txt'
--- pip_requirements.txt 2018-10-03 11:03:43 +0000
+++ pip_requirements.txt 2018-11-09 18:07:36 +0000
@@ -1,6 +1,6 @@
# Python requirements for widelands-website at 22.06.2017
-BeautifulSoup==3.2.0
+beautifulsoup4==4.6.3
Django==1.11.12
django-haystack==2.8.1
# django-messages is very old on pypi
@@ -11,6 +11,7 @@
django-registration==2.4.1
django-tagging==0.4.5
gunicorn==19.7.1
+lxml==4.2.5
Markdown==2.6.8
mysqlclient==1.3.10
numpy==1.13.0
=== modified file 'pybb/util.py'
--- pybb/util.py 2018-10-01 16:41:29 +0000
+++ pybb/util.py 2018-11-09 18:07:36 +0000
@@ -2,8 +2,9 @@
import random
import traceback
import json
+import re
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
from datetime import datetime
from django.shortcuts import render
from django.http import HttpResponse
@@ -11,7 +12,6 @@
from django.utils.translation import check_for_language
from django.utils.encoding import force_unicode
from django import forms
-from django.template.defaultfilters import urlize as django_urlize
from django.core.paginator import Paginator, EmptyPage, InvalidPage
from django.conf import settings
from pybb import settings as pybb_settings
@@ -145,6 +145,16 @@
    return form
+PLAIN_LINK_RE = re.compile(r'(http[s]?:\/\/[-a-zA-Z0-9@:%._\+~#=/?]+)')
+def exclude_code_tag(bs4_string):
+    if bs4_string.parent.name == 'code':
+        return False
+    m = PLAIN_LINK_RE.search(bs4_string)
+    if m:
+        return True
+    return False
+
+
def urlize(data):
"""Urlize plain text links in the HTML contents.
@@ -152,18 +162,29 @@
"""
-    soup = BeautifulSoup(data)
-    for chunk in soup.findAll(text=True):
-        islink = False
-        ptr = chunk.parent
-        while ptr.parent:
-            if ptr.name == 'a' or ptr.name == 'code':
-                islink = True
-                break
-            ptr = ptr.parent
-        if not islink:
-            # Using unescape to prevent conversation of f.e. > to &gt;
-            chunk = chunk.replaceWith(django_urlize(unicode(unescape(chunk))))
+    soup = BeautifulSoup(data, 'lxml')
+    for found_string in soup.find_all(string=exclude_code_tag):
+        new_content = []
+        strings_or_tags = found_string.parent.contents
+        for string_or_tag in strings_or_tags:
+            try:
+                for string in PLAIN_LINK_RE.split(string_or_tag):
+                    if string.startswith('http'):
+                        # Wrap the plain link in an anchor tag
+                        tag = soup.new_tag('a')
+                        tag['href'] = string
+                        tag.string = string
+                        tag['rel'] = 'nofollow'
+                        new_content.append(tag)
+                    else:
+                        # This is just a string, append it as a bs4 string
+                        new_content.append(NavigableString(string))
+            except TypeError:
+                # Not a plain string (e.g. a Tag), append it unchanged
+                new_content.append(string_or_tag)
+
+        # Replace the old contents with the new ones
+        found_string.parent.contents = new_content
    return unicode(soup)
=== modified file 'settings.py'
--- settings.py 2018-10-05 19:10:18 +0000
+++ settings.py 2018-11-09 18:07:36 +0000
@@ -213,8 +213,7 @@
    (':))', 'face-smile-big.png'),
    (':-)', 'face-smile.png'),
    (':)', 'face-smile.png'),
-    # Hack around markdown replacement. see also SMILEY_PREESCAPING
-    ('>:-)', 'face-devilish.png'),
+    ('>:-)', 'face-devilish.png'),
    ('8-)', 'face-glasses.png'),
    ('8)', 'face-glasses.png'),
    (':-D', 'face-grin.png'),
@@ -243,10 +242,6 @@
    (';-)', 'face-wink.png'),
    (';)', 'face-wink.png'),
]
-# This needs to be done to keep some stuff hidden from markdown
-SMILEY_PREESCAPING = [
-    ('>:-)', '\>:-)')
-]
#################
# Search Config #