# -*- encoding: utf-8 -*-
import io
import itertools
import re
import bs4
import bs4.element
class BeautifulSoup(bs4.BeautifulSoup):
@@ -83,41 +84,39 @@ class BeautifulSoup(bs4.BeautifulSoup):
def is_video_source(elem):
    """Tell whether `elem` is a <source> tag whose parent is a <video> tag.

    Objects lacking a `name` or `parent` attribute (or whose parent has no
    `name`, e.g. a parent of None) are reported as not being a video source.
    """
    try:
        if elem.name == 'source':
            # Only a <source> directly under a <video> counts.
            return elem.parent.name == 'video'
    except AttributeError:
        # Not tag-like at all; fall through to the negative answer.
        pass
    return False
def iter_attr(self, tag, attr_name, **kwargs):
    """Yield the value of `attr_name` for every matching element.

    `tag` and any extra keyword filters are passed straight to `find_all`;
    a filter `attr_name=True` is added so that only elements carrying the
    attribute are returned by the search.
    """
    filters = dict(kwargs)
    filters[attr_name] = True
    matches = self.find_all(tag, **filters)
    for match in matches:
        yield match[attr_name]
def iter_images(self):
"""Return an iterator of all image elements in this document.
def iter_image_urls(self):
"""Return an iterator of source URL strings of all images in this document.
Images include <img> tags and <video> poster attributes.
Images include <img> and <video> with a poster attribute.
"""
for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
yield elem[self.IMAGE_ATTRS[elem.name]]
elem[self.IMAGE_ATTRS[elem.name]]
except KeyError:
pass
else:
yield elem
def iter_video_urls(self):
    """Return an iterator over the `src` value of each video source element.

    Delegates to `iter_attr`, using `is_video_source` as the element matcher.
    """
    extract = self.iter_attr
    return extract(self.is_video_source, 'src')
def iter_videos(self):
    """Return all video source elements in this document that have a `src`.

    The result is whatever `find_all` returns when given `is_video_source`
    as the matcher together with the `src=True` attribute filter.
    """
    search = self.find_all
    return search(self.is_video_source, src=True)
class SoupModelMixin:
"""Mixin for models to parse HTML with BeautifulSoup.
Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
that name attributes with HTML in them. After that, all the public methods
are usable.
OG_PREVIEW_ATTR = 'data-ogpreview'
SOUP_ATTRS = []
def _get_soup(self):
return self._soup
@@ -125,20 +124,60 @@ class SoupModelMixin:
for attr_name in self.SOUP_ATTRS:
html.write(getattr(self, attr_name))
html.seek(0)
self._soup = BeautifulSoup(html)
def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
    """Build a sort-key function that ranks elements by one attribute.

    The returned callable reads `elem[attr_name]` and converts it with
    `getvalue`.  When the attribute is missing (KeyError) or cannot be
    converted (ValueError), `fallback` is used instead.

    Side effect: the computed key is written back to `elem[attr_name]`, so
    after sorting every element carries a converted value under that
    attribute.
    """
    def elem_sort_key(elem):
        # The `try:` is required for the `except` clause below to be valid.
        try:
            sort_key = getvalue(elem[attr_name])
        except (KeyError, ValueError):
            sort_key = fallback
        # Store the normalised value so later readers of the attribute see
        # the converted key rather than the raw attribute text.
        elem[attr_name] = sort_key
        return sort_key
    return elem_sort_key
def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
    """Build a predicate that applies `test` to one attribute of an element.

    The returned callable looks up `elem[attr_name]` and returns the result
    of `test` on that value.  With the default `test`, the value must
    compare greater than zero.
    """
    def attr_passes(elem):
        value = elem[attr_name]
        return test(value)
    return attr_passes
def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
    """Sort elements, keep those accepted by `pred`, and optionally slice.

    `elem_seq` is sorted eagerly with `elem_key` as the key function.  The
    sorted elements are then filtered lazily by `pred`; a `pred` of None
    keeps the truthy elements.  When `slice_args` are given they are passed
    to `itertools.islice` (stop, or start/stop[/step]) to limit the result.

    Returns an iterator over the surviving elements in sorted order.
    """
    if pred is None:
        # Preserve the "None means truthiness" convention of the filter
        # functions this replaces.
        pred = bool
    ordered = sorted(elem_seq, key=elem_key)
    # A generator expression is used instead of itertools.ifilter, which
    # only exists on Python 2; this form is lazy on both Python 2 and 3.
    kept = (elem for elem in ordered if pred(elem))
    if slice_args:
        return itertools.islice(kept, *slice_args)
    return kept
def get_description(self):
    """Return the soup's body-text excerpt joined into a single string.

    The pieces come from `some_body_text()` on the parsed soup and are
    concatenated with no separator.
    """
    soup = self._get_soup()
    fragments = soup.some_body_text()
    return u''.join(fragments)
def get_image_urls(self):
def get_image_urls(self, *slice_args):
"""Return an iterator of source URL strings of all images in the HTML.
Images include <img> sources and <video> poster attributes.
return self._get_soup().iter_image_urls()
def get_video_urls(self):
for elem in self._sort_and_slice_elems(
self._get_soup().iter_images(),
self._elem_key(),
self._elem_pred(),
*slice_args
):
yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]
def get_one_image_url(self):
    """Return the result of `get_image_urls` limited by a slice argument of 1.

    The return value is whatever `get_image_urls(1)` produces; it is passed
    through unchanged rather than unwrapped into a single string.
    """
    limit = 1
    return self.get_image_urls(limit)
def get_video_urls(self, *slice_args):
"""Return an iterator of source URL strings of all videos in the HTML."""
return self._get_soup().iter_video_urls()
self._get_soup().iter_videos(),
yield elem['src']
def get_one_video_url(self):
    """Return the result of `get_video_urls` limited by a slice argument of 1.

    The return value is whatever `get_video_urls(1)` produces; it is passed
    through unchanged rather than unwrapped into a single string.
    """
    limit = 1
    return self.get_video_urls(limit)