# -*- encoding: utf-8 -*-
import io
import itertools
import re
import bs4
@@ -86,25 +87,22 @@ class BeautifulSoup(bs4.BeautifulSoup):
except AttributeError:
return False
def iter_attr(self, tag, attr_name, **kwargs):
    """Return an iterator of ``attr_name`` values of all matching elements.

    ``tag`` and any extra keyword arguments are passed through to
    ``find_all``. A keyword argument named like ``attr_name`` is
    overwritten by the presence filter below.
    """
    # Restrict the search to elements that carry the attribute, so the
    # subscript in the loop cannot fail on a missing key.
    kwargs[attr_name] = True
    for elem in self.find_all(tag, **kwargs):
        yield elem[attr_name]
def iter_images(self):
    """Return an iterator of all image elements in this document.

    Images include <img> and <video> with a poster attribute.
    Elements that lack the attribute named for their tag in
    ``IMAGE_ATTRS`` are skipped.
    """
    for elem in self.find_all(list(self.IMAGE_ATTRS)):
        try:
            # Subscript only to probe that the URL attribute exists.
            elem[self.IMAGE_ATTRS[elem.name]]
        except KeyError:
            continue
        yield elem


def iter_image_urls(self):
    """Return an iterator of source URL strings of all images in this document.

    Images include <img> tags and <video> poster attributes.
    """
    for elem in self.iter_images():
        yield elem[self.IMAGE_ATTRS[elem.name]]
def iter_video_urls(self):
    """Return an iterator of source URL strings of all videos in this document.

    Delegates to ``iter_attr``, matching elements with
    ``self.is_video_source`` and reading their ``src`` attribute.
    """
    return self.iter_attr(self.is_video_source, 'src')
def iter_videos(self):
    """Return all video source elements in this document.

    Elements are matched with ``self.is_video_source`` and must have a
    ``src`` attribute. The result is whatever ``find_all`` returns (an
    iterable collection, not a one-shot iterator).
    """
    return self.find_all(self.is_video_source, src=True)
class SoupModelMixin:
@@ -115,6 +113,7 @@ class SoupModelMixin:
are usable.
OG_PREVIEW_ATTR = 'data-ogpreview'
SOUP_ATTRS = []
def _get_soup(self):
@@ -128,17 +127,57 @@ class SoupModelMixin:
self._soup = BeautifulSoup(html)
return self._soup
def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
    """Return a sort-key function that orders elements by ``attr_name``.

    The returned function converts ``elem[attr_name]`` with ``getvalue``.
    If the attribute is missing (``KeyError``) or cannot be converted
    (``ValueError``), ``fallback`` is used as the key instead.

    Side effect: the computed key is written back to ``elem[attr_name]``,
    so after the key function has run, every element carries a
    normalized value for the attribute.
    """
    def elem_sort_key(elem):
        try:
            sort_key = getvalue(elem[attr_name])
        except (KeyError, ValueError):
            sort_key = fallback
        # Store the normalized key on the element so later consumers
        # (e.g. a predicate reading the same attribute) see the
        # converted value rather than the raw attribute.
        elem[attr_name] = sort_key
        return sort_key
    return elem_sort_key
def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
    """Return a predicate that applies ``test`` to ``elem[attr_name]``.

    The attribute is read as-is, with no conversion and no fallback: a
    missing attribute raises ``KeyError``. The default ``test`` compares
    the value against 0, so the attribute is expected to already hold a
    number, such as the value written back by the key function from
    ``_elem_key``.
    """
    def elem_pred(elem):
        return test(elem[attr_name])
    return elem_pred
def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
seq = itertools.ifilter(pred, sorted(elem_seq, key=elem_key))
if slice_args:
return itertools.islice(seq, *slice_args)
return seq
def get_description(self):
    """Return a string with a brief excerpt of body text from the HTML.

    The excerpt is the concatenation of the text pieces produced by the
    soup's ``some_body_text()``.
    """
    return u''.join(self._get_soup().some_body_text())
def get_image_urls(self, *slice_args):
    """Return an iterator of source URL strings of all images in the HTML.

    Images include <img> sources and <video> poster attributes.

    Image elements are ordered with the default ``_elem_key()`` and
    filtered with the default ``_elem_pred()`` before their URLs are
    yielded. ``slice_args`` (``stop`` or ``start, stop[, step]``)
    optionally limit the result; with no arguments every remaining
    image URL is yielded.
    """
    for elem in self._sort_and_slice_elems(
        self._get_soup().iter_images(),
        self._elem_key(),
        self._elem_pred(),
        *slice_args
    ):
        # The URL lives in a different attribute depending on the tag.
        yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]
def get_one_image_url(self):
    """Return an iterator yielding at most one image URL from the HTML.

    This is ``get_image_urls(1)``: the result is an iterator, not a
    string, and it is empty when there is no image URL to yield.
    """
    return self.get_image_urls(1)
def get_video_urls(self, *slice_args):
    """Return an iterator of source URL strings of all videos in the HTML.

    Video source elements are ordered with the default ``_elem_key()``
    and filtered with the default ``_elem_pred()`` before their ``src``
    values are yielded. ``slice_args`` (``stop`` or
    ``start, stop[, step]``) optionally limit the result; with no
    arguments every remaining video URL is yielded.
    """
    for elem in self._sort_and_slice_elems(
        self._get_soup().iter_videos(),
        self._elem_key(),
        self._elem_pred(),
        *slice_args
    ):
        yield elem['src']
def get_one_video_url(self):
    """Return an iterator yielding at most one video URL from the HTML.

    This is ``get_video_urls(1)``: the result is an iterator, not a
    string, and it is empty when there is no video URL to yield.
    """
    return self.get_video_urls(1)