website Changeset - 3b2ed8397d4e · Conservancy Kallithea

Changeset - 3b2ed8397d4e

Parent rev.

Child rev.

[Not reviewed]

0 4 3

Brett Smith (brett) - 6 years ago 2017-11-07 16:17:33
brett@sfconservancy.org

blog/news: Add Open Graph metadata to entry pages.

This helps other social media sites generate nice previews for these pages.

7 files changed with 228 insertions and 2 deletions:

www/conservancy/apps/blog/models.py

www/conservancy/apps/news/models.py

www/conservancy/bsoup.py

144

www/conservancy/templates/blog/entry_detail.html

www/conservancy/templates/news/pressrelease_detail.html

www/conservancy/templates/opengraph_partial.html

www/conservancy/templates/opengraph_urllist_partial.html

0 comments (0 inline, 0 general)

www/conservancy/apps/blog/models.py

➞

Show inline comments

@@ ... / @@ -2,2 +2,3 @@ from django.db import models @@
 from django.conf import settings
 from conservancy import bsoup
 from conservancy.apps.staff.models import Person
@@ ... / @@ -20,3 +21,3 @@ class EntryTag(models.Model): @@
 class Entry(models.Model):
+class Entry(models.Model, bsoup.SoupModelMixin):
     """Blog entry"""
@@ ... / @@ -40,2 +41,4 @@ class Entry(models.Model): @@
     SOUP_ATTRS = ['body']
     def __unicode__(self):

www/conservancy/apps/news/models.py

➞

Show inline comments

@@ ... / @@ -2,2 +2,3 @@ from django.db import models @@
 from django.conf import settings
 from conservancy import bsoup
 from conservancy.apps.staff.models import Person
@@ ... / @@ -7,3 +8,3 @@ from datetime import datetime, timedelta @@
 class PressRelease(models.Model):
+class PressRelease(models.Model, bsoup.SoupModelMixin):
     """News release model"""
@@ ... / @@ -26,2 +27,4 @@ class PressRelease(models.Model): @@
     SOUP_ATTRS = ['summary', 'body']
     def __unicode__(self):

www/conservancy/bsoup.py

➞

Show inline comments

 new file 100644
 # -*- encoding: utf-8 -*-
 import io
 import re
 import bs4
 import bs4.element
 class BeautifulSoup(bs4.BeautifulSoup):
     """A wrapper of the original BeautifulSoup class, with convenience methods added.
     The added methods extract body text and image/video source URLs from
     the parsed document.
     """
     # Maps a tag name to the attribute of that tag that holds an image URL.
     IMAGE_ATTRS = {
         'img': 'src',
         'video': 'poster',
     }
     # Tags whose strings never count as body text (see _body_text).
     NON_BODY_TEXT_TAGS = frozenset([
         'img',
         'video',
     ])
     # Matches a string ending in sentence-final punctuation, optionally
     # followed by whitespace and non-word characters.
     SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')
     def __init__(self, src, parser='html5lib'):
         """Parse `src`, handing it to bs4.BeautifulSoup with the named parser."""
         # WARNING!  It seems like it would be ideal to use the 'lxml' parser
         # for speed, but that doesn't work in our web application.  On
         # Debian stretch, at least, using lxml causes the web server WSGI
         # application to go into an infinite loop.
         super(BeautifulSoup, self).__init__(src, parser)
     def _body_text(self, root):
         """Yield the body text strings found under `root`, in document order."""
         # "Body text" is all the strings under the root element, in order,
         # except:
         # * strings inside NON_BODY_TEXT_TAGS
         # * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
         #   an element where a NON_BODY_TEXT_TAGS element comes before any
         #   body text.  For example, in <div> <video …> … </div>, none of the
         #   div's strings are included in the body text, because it's
         #   considered to be a <video> container, and any strings are
         #   probably a caption, fallback text, or other non-body text.
         #
         # `started` turns true at the first non-whitespace string that is a
         # direct child of root; until then whitespace-only strings are
         # dropped.  `text_seen` additionally turns true when a child element
         # yields body text, so that media following ordinary content, as in
         # <p>…</p> <img …> <p>…</p>, does not cut off the rest of root.
         started = False
         text_seen = False
         for child in root.children:
             if isinstance(child, bs4.element.Tag):
                 if child.name in self.NON_BODY_TEXT_TAGS:
                     if not text_seen:
                         # Leading media: root is a container, so none of
                         # its remaining strings are body text.
                         break
                 else:
                     for s in self._body_text(child):
                         text_seen = True
                         yield s
             # It's not worth it to use isinstance here, because elements that
             # don't have body text like Comments and CDATA are subclasses of
             # NavigableString.
             elif type(child) is bs4.element.NavigableString:
                 if started:
                     yield child
                 elif not child.isspace():
                     started = True
                     text_seen = True
                     yield child
     def body_text(self):
         """Return an iterator of strings comprising this document's body text."""
         return self._body_text(self)
     def some_body_text(self, char_target=300):
         """Return an iterator of strings with some of this document's body text.
         This is the same as body_text, except after it yields a string that
         looks like the end of a sentence, it checks whether it has yielded
         more than `char_target` characters.  If so, the iterator stops.
         """
         # This implementation is likely to overshoot `char_target` a lot,
         # because it doesn't look inside the strings it yields, just at the
         # end of them.  We can implement something smarter later if needed.
         char_count = 0
         for s in self.body_text():
             yield s
             char_count += len(s)
             if (char_count > char_target) and self.SENTENCE_END.search(s):
                 break
     @staticmethod
     def is_video_source(elem):
         """Return whether `elem` is a <source> tag directly inside a <video> tag."""
         try:
             return elem.name == 'source' and elem.parent.name == 'video'
         except AttributeError:
             # `elem` or its parent has no `name` (for example, no parent).
             return False
     def iter_attr(self, tag, attr_name, **kwargs):
         """Yield the `attr_name` value of every element matched by find_all.
         `tag` and any extra keyword arguments are passed to find_all as the
         search criteria.
         """
         # Passing True for the attribute restricts the search to elements
         # that have it, so the lookup below cannot fail.
         kwargs[attr_name] = True
         for elem in self.find_all(tag, **kwargs):
             yield elem[attr_name]
     def iter_image_urls(self):
         """Return an iterator of source URL strings of all images in this document.
         Images include <img> tags and <video> poster attributes.
         """
         for elem in self.find_all(list(self.IMAGE_ATTRS)):
             try:
                 yield elem[self.IMAGE_ATTRS[elem.name]]
             except KeyError:
                 # The element lacks its image attribute, e.g. a <video>
                 # without a poster; it contributes no URL.
                 pass
     def iter_video_urls(self):
         """Return an iterator of source URL strings of all videos in this document."""
         return self.iter_attr(self.is_video_source, 'src')
 class SoupModelMixin:
     """Mixin that lets a model parse its own HTML attributes with BeautifulSoup.
     A class using this mixin lists, in `SOUP_ATTRS`, the names of its
     attributes that contain HTML.  The public methods below then operate on
     the concatenation of those attributes, in the order listed.
     """
     SOUP_ATTRS = []
     def _get_soup(self):
         # Parse on first use and keep the result on the instance as `_soup`,
         # so later calls reuse the same parsed document.
         try:
             soup = self._soup
         except AttributeError:
             buf = io.StringIO()
             for name in self.SOUP_ATTRS:
                 buf.write(getattr(self, name))
             # Rewind so the parser reads the markup from the beginning.
             buf.seek(0)
             soup = self._soup = BeautifulSoup(buf)
         return soup
     def get_description(self):
         """Return a string with a brief excerpt of body text from the HTML."""
         excerpt = self._get_soup().some_body_text()
         return u''.join(excerpt)
     def get_image_urls(self):
         """Return an iterator of source URL strings of all images in the HTML.
         Images include <img> tags and <video> poster attributes.
         """
         soup = self._get_soup()
         return soup.iter_image_urls()
     def get_video_urls(self):
         """Return an iterator of source URL strings of all videos in the HTML."""
         soup = self._get_soup()
         return soup.iter_video_urls()

www/conservancy/templates/blog/entry_detail.html

➞

Show inline comments

@@ ... / @@ -2,2 +2,8 @@ @@
 {% block head %}
 {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
 {% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
 {% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
 {% endblock %}
 {% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}

www/conservancy/templates/news/pressrelease_detail.html

➞

Show inline comments

@@ ... / @@ -2,2 +2,8 @@ @@
 {% block head %}
 {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
 {% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
 {% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
 {% endblock %}
 {% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}

www/conservancy/templates/opengraph_partial.html

➞

Show inline comments

 new file 100644
 {% comment %}
 Include this partial in a head section to include basic Open Graph metadata.
 Pass a variable `NAME` to give a value for the `og:NAME` property.
 These properties are only listed if you give a value for them:
 * url: A URL string that includes at least an absolute path.  This partial
   will fill in a default scheme and host if needed, by passing the
   `host_url` context variable to the `fill_url` filter.
 * title: A string.  Tags are stripped, then the rest is assumed HTML-safe.
 * description: A string.  Tags are stripped, then the rest is assumed
   HTML-safe.
 These properties are always included.  You can override them but you
 normally shouldn't need to:
 * type: Default "website".
 * locale: Default "en_US".
 * site_name: Default "Software Freedom Conservancy"
 {% endcomment %}
 <meta property="og:type" content="{{ type|default:"website" }}">
 <meta property="og:locale" content="{{ locale|default:"en_US" }}">
 <meta property="og:site_name" content="{{ site_name|default:"Software Freedom Conservancy" }}">
 {% if url %}
 {% load fill_url %}
 <meta property="og:url" content="{{ url|fill_url:host_url }}">
 {% endif %}
 {% if title %}
 <meta property="og:title" content="{{ title|striptags|safe }}">
 {% endif %}
 {% if description %}
 <meta property="og:description" content="{{ description|striptags|safe }}">
 {% endif %}

www/conservancy/templates/opengraph_urllist_partial.html

➞

Show inline comments

 new file 100644
 {% comment %}
 Include this partial in a head section to include a series of URLs for a
 given property, like og:image or og:video.
 You must pass the following variables:
 * property: A string with the name of the property, like 'image' or 'video'.
 * urls: A sequence of URL strings.  Each should include at least an absolute
   path.  This partial will fill in a scheme and host if needed, by passing
   the `host_url` context variable to the `fill_url` filter.
 You may also pass:
 * fallback: A URL string, following the same rules as in `urls`.  This URL
   will be used if `urls` is empty.
 {% endcomment %}
 {% load fill_url %}
 {% for url in urls %}
 <meta property="og:{{ property }}" content="{{ url|fill_url:host_url }}">
 {% empty %}
 {% if fallback %}
 <meta property="og:{{ property }}" content="{{ fallback|fill_url:host_url }}">
 {% endif %}
 {% endfor %}

0 comments (0 inline, 0 general)