Changeset - 703df9c8e97f
[Not reviewed]
0 3 0
Brett Smith (brett) - 5 years ago 2018-09-21 14:57:14
brett@sfconservancy.org
Blogs/news only include a single OG image/video.

For now, this gives us more assurance that other sites will choose the
preview we want.

You can control the selection by adding data-ogpreview to image, video, and
source elements. data-ogpreview=0 excludes the element from being included
in the preview. Positive numbers set the preview priority. The lowest
value found is chosen first.
3 files changed with 61 insertions and 22 deletions:
0 comments (0 inline, 0 general)
www/conservancy/bsoup.py
Show inline comments
 
# -*- encoding: utf-8 -*-
 

	
 
import io
 
import itertools
 
import re
 

	
 
import bs4
 
import bs4.element
 

	
 
class BeautifulSoup(bs4.BeautifulSoup):
 
    """A wrapper of the original BeautifulSoup class, with convenience methods added."""
 

	
 
    IMAGE_ATTRS = {
 
        'img': 'src',
 
        'video': 'poster',
 
    }
...
 
@@ -77,68 +78,106 @@ class BeautifulSoup(bs4.BeautifulSoup):
 
            yield s
 
            char_count += len(s)
 
            if (char_count > char_target) and self.SENTENCE_END.search(s):
 
                break
 

	
 
    @staticmethod
    def is_video_source(elem):
        """Tell whether `elem` is a <source> tag whose parent is a <video> tag.

        Objects that lack a `name` or `parent` attribute, or whose parent
        has no `name`, are reported as non-matching instead of raising.
        """
        try:
            if elem.name != 'source':
                return False
            return elem.parent.name == 'video'
        except AttributeError:
            return False
 

	
 
    def iter_attr(self, tag, attr_name, **kwargs):
        """Yield the value of `attr_name` from every matching element.

        `tag` and any extra keyword arguments are handed to `find_all`.
        The search is additionally constrained with `attr_name=True`, so
        each element found can be indexed by that attribute.
        """
        kwargs[attr_name] = True
        matches = self.find_all(tag, **kwargs)
        for match in matches:
            yield match[attr_name]
 
    def iter_images(self):
        """Return an iterator of all image elements in this document.

        Images include <img> and <video> with a poster attribute.  Elements
        named in `IMAGE_ATTRS` that lack their URL attribute are skipped.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
            try:
                # Looked up only to confirm the URL attribute exists; the
                # value itself is not needed here.
                elem[self.IMAGE_ATTRS[elem.name]]
            except KeyError:
                pass
            else:
                yield elem

    def iter_image_urls(self):
        """Return an iterator of source URL strings of all images in this document.

        Images include <img> tags and <video> poster attributes.  Kept so
        that callers written against the URL-yielding interface still work.
        """
        for elem in self.iter_images():
            yield elem[self.IMAGE_ATTRS[elem.name]]
 

	
 
    def iter_video_urls(self):
        """Return an iterator over the `src` URL of each video source in this document."""
        src_values = self.iter_attr(self.is_video_source, 'src')
        return src_values
 
    def iter_videos(self):
        """Return the video source elements in this document.

        The search matches elements accepted by `is_video_source` that
        also have a `src` attribute.
        """
        video_sources = self.find_all(self.is_video_source, src=True)
        return video_sources
 

	
 

	
 
class SoupModelMixin:
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public methods
    are usable.

    Image and video elements may carry an `OG_PREVIEW_ATTR` attribute to
    control which URLs are returned: with the default key and predicate, a
    value of 0 excludes the element and positive values set its priority,
    lowest first.  Elements without a usable value sort last.
    """

    OG_PREVIEW_ATTR = 'data-ogpreview'
    SOUP_ATTRS = []

    def _get_soup(self):
        """Return the parsed HTML from all `SOUP_ATTRS`, building it on first use.

        The result is cached on the instance as `_soup`.
        """
        try:
            return self._soup
        except AttributeError:
            html = io.StringIO()
            for attr_name in self.SOUP_ATTRS:
                html.write(getattr(self, attr_name))
            html.seek(0)
            self._soup = BeautifulSoup(html)
            return self._soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
        """Return a sort-key function that reads `attr_name` from an element.

        The key is `getvalue(elem[attr_name])`, or `fallback` when the
        attribute is missing (KeyError) or cannot be converted (ValueError).
        The computed key is written back onto the element so that a
        predicate from `_elem_pred` can test the converted value afterwards.
        """
        def elem_sort_key(elem):
            try:
                sort_key = getvalue(elem[attr_name])
            except (KeyError, ValueError):
                sort_key = fallback
            elem[attr_name] = sort_key
            return sort_key
        return elem_sort_key

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        """Return a predicate that applies `test` to `elem[attr_name]`.

        The default `test` compares against 0, so it expects the attribute
        to already hold a number, as it does once the key function from
        `_elem_key` has processed the element.
        """
        def elem_pred(elem):
            return test(elem[attr_name])
        return elem_pred

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        """Return an iterator of `elem_seq` sorted by `elem_key`, filtered by `pred`.

        When `slice_args` are given they are passed to `itertools.islice`
        to limit the filtered result.
        """
        # sorted() calls elem_key on every element before any filtering
        # happens, so pred may rely on values elem_key stored on the elements.
        # A generator expression is used instead of itertools.ifilter, which
        # does not exist on Python 3.
        seq = (elem for elem in sorted(elem_seq, key=elem_key) if pred(elem))
        if slice_args:
            return itertools.islice(seq, *slice_args)
        return seq

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> sources and <video> poster attributes.  URLs
        come out in preview-priority order, and elements excluded by the
        preview attribute are omitted.  `slice_args`, if given, are
        `itertools.islice` arguments that limit the result.
        """
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_images(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]

    def get_one_image_url(self):
        """Return an iterator of at most one image URL: the preferred preview."""
        return self.get_image_urls(1)

    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML.

        URLs come out in preview-priority order, and elements excluded by
        the preview attribute are omitted.  `slice_args`, if given, are
        `itertools.islice` arguments that limit the result.
        """
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_videos(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem['src']

    def get_one_video_url(self):
        """Return an iterator of at most one video URL: the preferred preview."""
        return self.get_video_urls(1)
www/conservancy/templates/blog/entry_detail.html
Show inline comments
 
{% extends "base_blog.html" %}
 

	
 
{% block head %}
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
{# Pass at most one image URL and one video URL so other sites choose the preview we want. #}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
{% endblock %}
 

	
 
{# NOTE(review): Django's documentation warns against applying `safe` to `striptags` output; this presumably relies on headlines being trusted, editor-written HTML. Confirm. #}
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
 

	
 
{% block content %}
 

	
 
{% include "blog/entry_partial.html" with entry=object htag="h2" only %}
 

	
 
<p class="blog-comments">Please email any comments on this entry to
 
  <a href="mailto:info@sfconservancy.org">info@sfconservancy.org</a>.</p>
 

	
 
<p><span class="continued"><a href="/blog/">Other Conservancy Blog entries&hellip;</a></span></p>
www/conservancy/templates/news/pressrelease_detail.html
Show inline comments
 
{% extends "base_news.html" %}
 

	
 
{% block head %}
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
{# Pass at most one image URL and one video URL so other sites choose the preview we want. #}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
{% endblock %}
 

	
 
{# NOTE(review): Django's documentation warns against applying `safe` to `striptags` output; this presumably relies on headlines being trusted, editor-written HTML. Confirm. #}
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
 

	
 
{% block content %}
 

	
 
<div id="mainContent">
 
{% include "news/pressrelease_partial.html" with pressr=object htag="h2" only %}
 
</div>
 
{% endblock %}
0 comments (0 inline, 0 general)