# -*- encoding: utf-8 -*-

import io
import itertools
import re

import bs4
import bs4.element


class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience methods added."""

    # Maps a tag name to the name of one of that tag's attributes.
    # NOTE(review): presumably the attribute that holds the tag's image URL;
    # the code that reads this table is outside this view -- confirm.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }
    # Tags whose strings are left out of the "body text"; the exact rule is
    # spelled out in the comment at the top of _body_text.
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])
    # Matches text whose tail is '.', '?' or '!' followed only by optional
    # whitespace and non-word characters up to the end of the string.
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        """Build the soup by handing ``src`` and ``parser`` to bs4.

        Args:
            src: The markup to parse; forwarded unchanged to
                ``bs4.BeautifulSoup.__init__``.
            parser: Forwarded as the second positional argument of
                ``bs4.BeautifulSoup.__init__``.  Defaults to ``'html5lib'``
                for the reason given in the warning below.
        """
        # WARNING!  It seems like it would be ideal to use the 'lxml' parser
        # for speed, but that doesn't work in our web application.  On
        # Debian stretch, at least, using lxml causes the web server WSGI
        # application to go into an infinite loop.
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        # NOTE(review): the HTML snippet in the example below was missing
        # from this comment and has been filled in from the rule it
        # illustrates -- confirm against the original source.
        #
        # "Body text" is all the strings under the root element, in order,
        # except:
        # * strings inside NON_BODY_TEXT_TAGS
        # * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
        #   an element that has a NON_BODY_TEXT_TAGS element as its first child.
        #   For example, in <div><img src="x.png">Caption</div>, none of the div's strings
        #   are included in the body text, because it's considered to be a
        #