diff --git a/www/conservancy/bsoup.py b/www/conservancy/bsoup.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb0ef6cb3d2ad50322ecc1a17f4212f713727a91
--- /dev/null
+++ b/www/conservancy/bsoup.py
@@ -0,0 +1,144 @@
+# -*- encoding: utf-8 -*-
+
+import io
+import re
+
+import bs4
+import bs4.element
+
+class BeautifulSoup(bs4.BeautifulSoup):
+ """A wrapper of the original BeautifulSoup class, with convenience methods added."""
+
+ IMAGE_ATTRS = {  # tag name -> attribute of that tag naming its image (presumably a URL; usage not visible here -- confirm)
+ 'img': 'src',
+ 'video': 'poster',
+ }
+ NON_BODY_TEXT_TAGS = frozenset([  # tags whose strings do not count as "body text" (see the comment in _body_text)
+ 'img',
+ 'video',
+ ])
+ SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')  # '.', '?' or '!' followed only by whitespace/non-word characters up to the end of the string
+
+ def __init__(self, src, parser='html5lib'):
+     """Initialize the soup from `src`, using the 'html5lib' parser unless told otherwise."""
+     # CAUTION: 'lxml' looks ideal for speed, but it doesn't work in our web application:
+     # on Debian stretch (at least) it sends the web server's WSGI application into an infinite loop.
+     parent = super(BeautifulSoup, self)
+     parent.__init__(src, parser)
+
+ def _body_text(self, root):
+ # "Body text" is all the strings under the root element, in order,
+ # except:
+ # * strings inside NON_BODY_TEXT_TAGS
+ # * strings inside containers of NON_BODY_TEXT_TAGS. A container is
+ # an element that has a NON_BODY_TEXT_TAGS element as its first child.
+ # For example, in <div><img src="..."> caption</div>, none of the div's strings
+ # are included in the body text, because it's considered to be a
+ #