diff --git a/www/conservancy/bsoup.py b/www/conservancy/bsoup.py new file mode 100644 index 0000000000000000000000000000000000000000..fb0ef6cb3d2ad50322ecc1a17f4212f713727a91 --- /dev/null +++ b/www/conservancy/bsoup.py @@ -0,0 +1,144 @@ +# -*- encoding: utf-8 -*- + +import io +import re + +import bs4 +import bs4.element + +class BeautifulSoup(bs4.BeautifulSoup): + """A wrapper of the original BeautifulSoup class, with convenience methods added.""" + + IMAGE_ATTRS = { + 'img': 'src', + 'video': 'poster', + } + NON_BODY_TEXT_TAGS = frozenset([ + 'img', + 'video', + ]) + SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$') + + def __init__(self, src, parser='html5lib'): + # WARNING! It seems like it would be ideal to use the 'lxml' parser + # for speed, but that doesn't work in our web application. On + # Debian stretch, at least, using lxml causes the web server WSGI + # application to go into an infinite loop. + super(BeautifulSoup, self).__init__(src, parser) + + def _body_text(self, root): + # "Body text" is all the strings under the root element, in order, + # except: + # * strings inside NON_BODY_TEXT_TAGS + # * strings inside containers of NON_BODY_TEXT_TAGS. A container is + # an element that has a NON_BODY_TEXT_TAGS element as its first child. + # For example, in
, none of the div's strings + # are included in the body text, because it's considered to be a + #