Files
@ 2c26ede197a0
Branch filter:
Location: website/www/conservancy/bsoup.py - annotation
2c26ede197a0
6.4 KiB
text/x-python
supporter: Add anchors.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 703df9c8e97f 3b2ed8397d4e 703df9c8e97f 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 703df9c8e97f 3b2ed8397d4e 703df9c8e97f 703df9c8e97f 703df9c8e97f 
3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 3b2ed8397d4e 3b2ed8397d4e 703df9c8e97f 3b2ed8397d4e 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 3b2ed8397d4e 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f 703df9c8e97f | # -*- encoding: utf-8 -*-
import io
import itertools
import re
import bs4
import bs4.element
class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience methods added."""

    # Maps a tag name to the attribute that holds its image URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }

    # Tags whose contents (and whose containers' contents) are excluded
    # from "body text" — see _body_text below.
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])

    # Matches a string that ends like a sentence: terminal punctuation,
    # optionally followed by whitespace and trailing non-word characters
    # (e.g. a closing quote or bracket).
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        # WARNING! It seems like it would be ideal to use the 'lxml' parser
        # for speed, but that doesn't work in our web application. On
        # Debian stretch, at least, using lxml causes the web server WSGI
        # application to go into an infinite loop.
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        """Yield the body-text strings found under `root`, in document order.

        "Body text" is all the strings under the root element, in order,
        except:
          * strings inside NON_BODY_TEXT_TAGS
          * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
            an element that has a NON_BODY_TEXT_TAGS element as its first
            child.  For example, in <div> <video …> … </div>, none of the
            div's strings are included in the body text, because it's
            considered to be a <video> container, and any strings are
            probably a caption, fallback text, or other non-body text.
        """
        # `started` flips to True once a direct non-whitespace string child
        # has been yielded at this level; before that, hitting a
        # NON_BODY_TEXT_TAGS child means `root` is a media container and the
        # whole level is abandoned via `break`.
        started = False
        for child in root.children:
            child_type = type(child)
            if issubclass(child_type, bs4.element.Tag):
                if child.name in self.NON_BODY_TEXT_TAGS:
                    if not started:
                        break
                else:
                    # Recurse into ordinary tags; note that strings yielded
                    # from the recursion do not set `started` at this level.
                    for s in self._body_text(child):
                        yield s
            # It's not worth it to use issubclass here, because elements that
            # don't have body text like Comments and CDATA are subclasses of
            # NavigableString.
            elif child_type is bs4.element.NavigableString:
                if started:
                    yield child
                elif child.isspace():
                    # Leading whitespace before any real text is ignored.
                    pass
                else:
                    yield child
                    started = True

    def body_text(self):
        """Return an iterator of strings comprising this document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator of strings with some of this document's body text.

        This is the same as body_text, except after it yields a string that
        looks like the end of a sentence, it checks whether it has yielded
        at least `char_target` characters.  If so, the iterator stops.
        """
        # This implementation is likely to overshoot `char_target` a lot,
        # because it doesn't look inside the strings it yields, just at the
        # end of them.  We can implement something smarter later if needed.
        char_count = 0
        for s in self.body_text():
            yield s
            char_count += len(s)
            if (char_count > char_target) and self.SENTENCE_END.search(s):
                break

    @staticmethod
    def is_video_source(elem):
        """Return True if `elem` is a <source> tag directly inside a <video>."""
        try:
            return elem.name == 'source' and elem.parent.name == 'video'
        except AttributeError:
            # Strings and the document root lack .name/.parent attributes.
            return False

    def iter_images(self):
        """Return an iterator of all image elements in this document.

        Images include <img> and <video> with a poster attribute.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
            try:
                # Probe for the URL attribute; tags without it are skipped.
                elem[self.IMAGE_ATTRS[elem.name]]
            except KeyError:
                pass
            else:
                yield elem

    def iter_videos(self):
        """Return an iterator of all video source elements in this document."""
        return self.find_all(self.is_video_source, src=True)
class SoupModelMixin:
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public
    methods are usable.
    """

    # Attribute editors set on media elements to rank them for previews.
    OG_PREVIEW_ATTR = 'data-ogpreview'
    # Subclasses override this with the names of their HTML-bearing fields.
    SOUP_ATTRS = []

    def _get_soup(self):
        """Parse and cache a BeautifulSoup of all the HTML in SOUP_ATTRS."""
        try:
            return self._soup
        except AttributeError:
            # Concatenate every HTML attribute into one document and parse
            # it once; the result is memoized on the instance.
            html = io.StringIO()
            for attr_name in self.SOUP_ATTRS:
                html.write(getattr(self, attr_name))
            html.seek(0)
            self._soup = BeautifulSoup(html)
            return self._soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
        """Return a sort key function that ranks elements by `attr_name`.

        The key parses the attribute with `getvalue`, substituting
        `fallback` when the attribute is missing or unparseable (so
        unranked elements sort last).  As a side effect it writes the
        parsed value back onto the element, so a later predicate from
        _elem_pred can read it without re-parsing.
        """
        def elem_sort_key(elem):
            try:
                sort_key = getvalue(elem[attr_name])
            except (KeyError, ValueError):
                sort_key = fallback
            # Cache the normalized value on the element for _elem_pred.
            elem[attr_name] = sort_key
            return sort_key
        return elem_sort_key

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        """Return a predicate over the value cached by an _elem_key function.

        With the defaults, elements ranked zero or negative are excluded.
        Assumes the key function has already run on the element (sorted()
        evaluates keys before filtering happens).
        """
        def elem_pred(elem):
            return test(elem[attr_name])
        return elem_pred

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        """Sort `elem_seq` by `elem_key`, filter with `pred`, then slice.

        `slice_args`, if given, are passed to itertools.islice.
        """
        # The builtin `filter` works on both Python 2 and 3;
        # `itertools.ifilter` was removed in Python 3.
        seq = filter(pred, sorted(elem_seq, key=elem_key))
        if slice_args:
            return itertools.islice(seq, *slice_args)
        else:
            return seq

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> sources and <video> poster attributes.
        """
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_images(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]

    def get_one_image_url(self):
        """Return an iterator yielding at most the single best image URL."""
        return self.get_image_urls(1)

    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML."""
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_videos(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem['src']

    def get_one_video_url(self):
        """Return an iterator yielding at most the single best video URL."""
        return self.get_video_urls(1)
|