File diff 5c75decd30a1 → 703df9c8e97f
Show inline comments
# -*- encoding: utf-8 -*-

import io
import itertools
import re

import bs4
import bs4.element

class BeautifulSoup(bs4.BeautifulSoup):
@@ -83,41 +84,39 @@ class BeautifulSoup(bs4.BeautifulSoup):
    def is_video_source(elem):
            return == 'source' and == 'video'
        except AttributeError:
            return False

    def iter_attr(self, tag, attr_name, **kwargs):
        """Yield the `attr_name` value of each element matched by `find_all`.

        `tag` and any extra keyword filters are forwarded to `find_all`.
        A filter of `attr_name=True` is always added (replacing any filter
        the caller passed under that name), so each yielded lookup is made
        on an element selected with that filter in place.
        """
        kwargs.update({attr_name: True})
        matches = self.find_all(tag, **kwargs)
        for match in matches:
            yield match[attr_name]
    def iter_images(self):
        """Return an iterator of all image elements in this document.

    def iter_image_urls(self):
        """Return an iterator of source URL strings of all images in this document.

        Images include <img> tags and <video> poster attributes.
        Images include <img> and <video> with a poster attribute.
        for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
                yield elem[self.IMAGE_ATTRS[]]
            except KeyError:
                yield elem

    def iter_video_urls(self):
        """Return an iterator over the `src` values of video source elements.

        Delegates to `iter_attr`, using `is_video_source` as the element
        filter and 'src' as the attribute to read.
        """
        attr_iter = self.iter_attr
        return attr_iter(self.is_video_source, 'src')
    def iter_videos(self):
        """Return the video source elements found in this document.

        The result is whatever `find_all` returns when filtered with
        `is_video_source` and `src=True`.
        """
        find = self.find_all
        return find(self.is_video_source, src=True)


class SoupModelMixin:
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public methods
    are usable.

    OG_PREVIEW_ATTR = 'data-ogpreview'
    SOUP_ATTRS = []

    def _get_soup(self):
            return self._soup
        except AttributeError:
@@ -125,20 +124,60 @@ class SoupModelMixin:
            for attr_name in self.SOUP_ATTRS:
                html.write(getattr(self, attr_name))
            self._soup = BeautifulSoup(html)
            return self._soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
        def elem_sort_key(elem):
                sort_key = getvalue(elem[attr_name])
            except (KeyError, ValueError):
                sort_key = fallback
            elem[attr_name] = sort_key
            return sort_key
        return elem_sort_key

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        def elem_pred(elem):
            return test(elem[attr_name])
        return elem_pred

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        seq = itertools.ifilter(pred, sorted(elem_seq, key=elem_key))
        if slice_args:
            return itertools.islice(seq, *slice_args)
            return seq

    def get_description(self):
        """Return the soup's `some_body_text()` pieces joined into one string.

        The pieces are concatenated with no separator.
        """
        soup = self._get_soup()
        text_pieces = soup.some_body_text()
        return u''.join(text_pieces)

    def get_image_urls(self):
    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> tags and <video> poster attributes.
        Images include <img> sources and <video> poster attributes.
        return self._get_soup().iter_image_urls()

    def get_video_urls(self):
        for elem in self._sort_and_slice_elems(
            yield elem[BeautifulSoup.IMAGE_ATTRS[]]

    def get_one_image_url(self):
        """Return `get_image_urls` called with a single slice argument of 1.

        NOTE(review): the result is whatever `get_image_urls(1)` returns,
        which looks like an iterator rather than a bare URL string --
        confirm against callers.
        """
        limited_urls = self.get_image_urls(1)
        return limited_urls

    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML."""
        return self._get_soup().iter_video_urls()
        for elem in self._sort_and_slice_elems(
            yield elem['src']

    def get_one_video_url(self):
        """Return `get_video_urls` called with a single slice argument of 1.

        NOTE(review): the result is whatever `get_video_urls(1)` returns,
        which looks like an iterator rather than a bare URL string --
        confirm against callers.
        """
        limited_urls = self.get_video_urls(1)
        return limited_urls