Changeset - 3b2ed8397d4e
[Not reviewed]
0 4 3
Brett Smith (brett) - 4 years ago 2017-11-07 16:17:33
brett@sfconservancy.org
blog/news: Add Open Graph metadata to entry pages.

This helps other social media sites generate nice previews for these pages.
7 files changed with 228 insertions and 2 deletions:
0 comments (0 inline, 0 general)
www/conservancy/apps/blog/models.py
Show inline comments
 
from django.db import models
 
from django.conf import settings
 
from conservancy import bsoup
 
from conservancy.apps.staff.models import Person
 
from datetime import datetime, timedelta
 

	
...
 
@@ -18,7 +19,7 @@ class EntryTag(models.Model):
 
    def get_absolute_url(self):
 
        return u"/blog/?tag=%s" % self.slug
 

	
 
class Entry(models.Model):
 
class Entry(models.Model, bsoup.SoupModelMixin):
 
    """Blog entry"""
 

	
 
    headline = models.CharField(max_length=200)
...
 
@@ -38,6 +39,8 @@ class Entry(models.Model):
 
        ordering = ('-pub_date',)
 
        get_latest_by = 'pub_date'
 

	
 
    SOUP_ATTRS = ['body']
 

	
 
    def __unicode__(self):
 
        return self.headline
 

	
www/conservancy/apps/news/models.py
Show inline comments
 
from django.db import models
 
from django.conf import settings
 
from conservancy import bsoup
 
from conservancy.apps.staff.models import Person
 
from conservancy.apps.events.models import Event
 
from django.contrib.sites.models import Site
 
from datetime import datetime, timedelta
 

	
 
class PressRelease(models.Model):
 
class PressRelease(models.Model, bsoup.SoupModelMixin):
 
    """News release model"""
 

	
 
    headline = models.CharField(max_length=300)
...
 
@@ -24,6 +25,8 @@ class PressRelease(models.Model):
 
        ordering = ("-pub_date",)
 
        get_latest_by = "pub_date"
 

	
 
    SOUP_ATTRS = ['summary', 'body']
 

	
 
    def __unicode__(self):
 
        return self.headline
 

	
www/conservancy/bsoup.py
Show inline comments
 
new file 100644
 
# -*- encoding: utf-8 -*-
 

	
 
import io
 
import re
 

	
 
import bs4
 
import bs4.element
 

	
 
class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience methods added.

    The added methods extract a document's body text and the URLs of the
    images and videos it embeds.
    """

    # Maps a tag name to the attribute on that tag that holds an image URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }
    # Tags whose own contents are never treated as body text.
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])
    # Matches a string that ends with sentence punctuation, optionally
    # followed by whitespace and non-word characters (closing quotes,
    # brackets, and so on).
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        """Parse `src` with the parser named `parser`.

        `src` is handed to bs4.BeautifulSoup unchanged, so it may be anything
        that class accepts as markup.
        """
        # WARNING!  It seems like it would be ideal to use the 'lxml' parser
        # for speed, but that doesn't work in our web application.  On
        # Debian stretch, at least, using lxml causes the web server WSGI
        # application to go into an infinite loop.
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        # "Body text" is all the strings under the root element, in order,
        # except:
        # * strings inside NON_BODY_TEXT_TAGS
        # * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
        #   an element that has a NON_BODY_TEXT_TAGS element as its first child.
        #   For example, in <div> <video ...> ... </div>, none of the div's
        #   strings are included in the body text, because it's considered to
        #   be a <video> container, and any strings are probably a caption,
        #   fallback text, or other non-body text.
        #
        # `started` records whether this element has produced any body text
        # yet, either directly or through a nested element.  Leading
        # whitespace and nested elements that produce nothing do not count,
        # so <div> <a name="x"></a> <video ...> ... </div> is still a
        # container.
        started = False
        for child in root.children:
            if isinstance(child, bs4.element.Tag):
                if child.name not in self.NON_BODY_TEXT_TAGS:
                    for s in self._body_text(child):
                        # Text from a nested element means the media element
                        # (if any) is not this element's first content, so
                        # this element is not a container.
                        started = True
                        yield s
                elif not started:
                    # A media element before any body text: treat `root` as
                    # its container and ignore the rest of it.
                    break
                # Otherwise this is a media element in the middle of body
                # text.  Skip it (and its contents) and keep going.
            # It's not worth it to use isinstance here, because elements that
            # don't have body text like Comments and CDATA are subclasses of
            # NavigableString.
            elif type(child) is bs4.element.NavigableString:
                if started or not child.isspace():
                    started = True
                    yield child

    def body_text(self):
        """Return an iterator of strings comprising this document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator of strings with some of this document's body text.

        This is the same as body_text, except after it yields a string that
        looks like the end of a sentence, it checks whether it has yielded
        at least `char_target` characters.  If so, the iterator stops.
        """
        # This implementation is likely to overshoot `char_target` a lot,
        # because it doesn't look inside the strings it yields, just at the
        # end of them.  We can implement something smarter later if needed.
        char_count = 0
        for s in self.body_text():
            yield s
            char_count += len(s)
            if (char_count >= char_target) and self.SENTENCE_END.search(s):
                break

    @staticmethod
    def is_video_source(elem):
        """Return True if `elem` is a <source> tag directly inside a <video> tag.

        Objects that lack a `name` or `parent` attribute, and elements with
        no parent, are reported as False rather than raising.
        """
        try:
            return elem.name == 'source' and elem.parent.name == 'video'
        except AttributeError:
            return False

    def iter_attr(self, tag, attr_name, **kwargs):
        """Yield the `attr_name` value of each matching element that has one.

        `tag` and any extra keyword arguments are passed to find_all as
        filters.  Only elements that actually carry the `attr_name`
        attribute are matched.
        """
        # Passing True as an attribute filter makes find_all match only
        # elements where that attribute is present.
        kwargs[attr_name] = True
        for elem in self.find_all(tag, **kwargs):
            yield elem[attr_name]

    def iter_image_urls(self):
        """Return an iterator of source URL strings of all images in this document.

        Images include <img> tags and <video> poster attributes.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS)):
            try:
                yield elem[self.IMAGE_ATTRS[elem.name]]
            except KeyError:
                # The element has no image attribute (for example, a <video>
                # without a poster), so it contributes no URL.
                pass

    def iter_video_urls(self):
        """Return an iterator of source URL strings of all videos in this document."""
        return self.iter_attr(self.is_video_source, 'src')
 

	
 

	
 
# Inherit from object explicitly so this is a new-style class under Python 2
# as well as Python 3.
class SoupModelMixin(object):
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public methods
    are usable.

    The HTML is parsed on first use and the result is cached on the instance
    as `_soup`, so changes made to the source attributes afterwards are not
    reflected by these methods.
    """

    SOUP_ATTRS = []

    def _get_soup(self):
        """Return the parsed HTML of every `SOUP_ATTRS` attribute, in order.

        The attribute values are concatenated with nothing between them and
        parsed as one document.
        """
        try:
            return self._soup
        except AttributeError:
            pass
        html = io.StringIO()
        for attr_name in self.SOUP_ATTRS:
            fragment = getattr(self, attr_name)
            # An unset (None) or empty attribute contributes no markup.
            # Writing None to a StringIO would raise TypeError.
            if fragment:
                html.write(fragment)
        html.seek(0)
        self._soup = BeautifulSoup(html)
        return self._soup

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

    def get_image_urls(self):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> tags and <video> poster attributes.
        """
        return self._get_soup().iter_image_urls()

    def get_video_urls(self):
        """Return an iterator of source URL strings of all videos in the HTML."""
        return self._get_soup().iter_video_urls()
www/conservancy/templates/blog/entry_detail.html
Show inline comments
 
{% extends "base_blog.html" %}
 

	
 
{# Open Graph metadata for this blog entry: the basic properties first, then one og:image per image URL (falling back to the Conservancy logo when there are none) and one og:video per video URL. #}
{% block head %}
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
{% endblock %}
 

	
 
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
 

	
 
{% block content %}
www/conservancy/templates/news/pressrelease_detail.html
Show inline comments
 
{% extends "base_news.html" %}
 

	
 
{# Open Graph metadata for this press release: the basic properties first, then one og:image per image URL (falling back to the Conservancy logo when there are none) and one og:video per video URL. #}
{% block head %}
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
{% endblock %}
 

	
 
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
 

	
 
{% block content %}
www/conservancy/templates/opengraph_partial.html
Show inline comments
 
new file 100644
 
{% comment %}

Include this partial in a head section to include basic Open Graph metadata.
Pass a variable `NAME` to give a value for the `og:NAME` property.

These properties are only listed if you give a value for them:

* url: A URL string that includes at least an absolute path.  This partial
  will fill in a default scheme and host if needed.
* title: A string.  Tags are stripped, then the rest is assumed HTML-safe.
  Because the result is written into an attribute without escaping, any
  double quote or ampersand in the title must already be written as an
  entity.
* description: A string.  Tags are stripped, then the rest is HTML-escaped
  (unless the caller has already marked the string safe), so plain text
  containing quotes or ampersands can be passed as-is.

These properties are always included.  You can override them but you
normally shouldn't need to:

* type: Default "website".
* locale: Default "en_US".
* site_name: Default "Software Freedom Conservancy"

{% endcomment %}

<meta property="og:type" content="{{ type|default:"website" }}">
<meta property="og:locale" content="{{ locale|default:"en_US" }}">
<meta property="og:site_name" content="{{ site_name|default:"Software Freedom Conservancy" }}">

{% if url %}
{% load fill_url %}
<meta property="og:url" content="{{ url|fill_url:host_url }}">
{% endif %}

{% if title %}
<meta property="og:title" content="{{ title|striptags|safe }}">
{% endif %}

{% if description %}
<meta property="og:description" content="{{ description|striptags }}">
{% endif %}
www/conservancy/templates/opengraph_urllist_partial.html
Show inline comments
 
new file 100644
 
{% comment %}

Include this partial in a head section to include a series of URLs for a
given property, like og:image or og:video.  One <meta> tag is written per
URL.

You must pass the following variables:

* property: A string with the name of the property, like 'image' or 'video'.
  It is written after the `og:` prefix.
* urls: A sequence of URL strings.  Each should include at least an absolute
  path.  This partial will fill in a scheme and host if needed.

You may also pass:

* fallback: A URL string, following the same rules as in `urls`.  This URL
  will be used if `urls` is empty.  Without a fallback, an empty `urls`
  produces no <meta> tag at all.

Every URL is passed through the `fill_url` filter with the `host_url`
variable as its argument.  This partial does not set `host_url` itself, so
it presumably comes from the including template's context (TODO confirm).

{% endcomment %}

{% load fill_url %}
{% for url in urls %}
<meta property="og:{{ property }}" content="{{ url|fill_url:host_url }}">
{% empty %}
{% if fallback %}
<meta property="og:{{ property }}" content="{{ fallback|fill_url:host_url }}">
{% endif %}
{% endfor %}
0 comments (0 inline, 0 general)