diff --git a/www/conservancy/apps/blog/models.py b/www/conservancy/apps/blog/models.py index 68b99caacecd029be9f2310b3f205b21b9a191e6..85dea531bbd62a0ccd1ef98b63b39e46a07db0ac 100644 --- a/www/conservancy/apps/blog/models.py +++ b/www/conservancy/apps/blog/models.py @@ -1,5 +1,6 @@ from django.db import models from django.conf import settings +from conservancy import bsoup from conservancy.apps.staff.models import Person from datetime import datetime, timedelta @@ -18,7 +19,7 @@ class EntryTag(models.Model): def get_absolute_url(self): return u"/blog/?tag=%s" % self.slug -class Entry(models.Model): +class Entry(models.Model, bsoup.SoupModelMixin): """Blog entry""" headline = models.CharField(max_length=200) @@ -38,6 +39,8 @@ class Entry(models.Model): ordering = ('-pub_date',) get_latest_by = 'pub_date' + SOUP_ATTRS = ['body'] + def __unicode__(self): return self.headline diff --git a/www/conservancy/apps/news/models.py b/www/conservancy/apps/news/models.py index 89e0cc4ce9cc19b052435feb4cf504e298cbec01..4fc5e3d93e5c8053fffc44afcbef0d6cc10aebfe 100644 --- a/www/conservancy/apps/news/models.py +++ b/www/conservancy/apps/news/models.py @@ -1,11 +1,12 @@ from django.db import models from django.conf import settings +from conservancy import bsoup from conservancy.apps.staff.models import Person from conservancy.apps.events.models import Event from django.contrib.sites.models import Site from datetime import datetime, timedelta -class PressRelease(models.Model): +class PressRelease(models.Model, bsoup.SoupModelMixin): """News release model""" headline = models.CharField(max_length=300) @@ -24,6 +25,8 @@ class PressRelease(models.Model): ordering = ("-pub_date",) get_latest_by = "pub_date" + SOUP_ATTRS = ['summary', 'body'] + def __unicode__(self): return self.headline diff --git a/www/conservancy/bsoup.py b/www/conservancy/bsoup.py new file mode 100644 index 0000000000000000000000000000000000000000..fb0ef6cb3d2ad50322ecc1a17f4212f713727a91 --- /dev/null +++ 
b/www/conservancy/bsoup.py @@ -0,0 +1,144 @@ +# -*- encoding: utf-8 -*- + +import io +import re + +import bs4 +import bs4.element + +class BeautifulSoup(bs4.BeautifulSoup): + """A wrapper of the original BeautifulSoup class, with convenience methods added.""" + + IMAGE_ATTRS = { + 'img': 'src', + 'video': 'poster', + } + NON_BODY_TEXT_TAGS = frozenset([ + 'img', + 'video', + ]) + SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$') + + def __init__(self, src, parser='html5lib'): + # WARNING! It seems like it would be ideal to use the 'lxml' parser + # for speed, but that doesn't work in our web application. On + # Debian stretch, at least, using lxml causes the web server WSGI + # application to go into an infinite loop. + super(BeautifulSoup, self).__init__(src, parser) + + def _body_text(self, root): + # "Body text" is all the strings under the root element, in order, + # except: + # * strings inside NON_BODY_TEXT_TAGS + # * strings inside containers of NON_BODY_TEXT_TAGS. A container is + # an element that has a NON_BODY_TEXT_TAGS element as its first child. + # For example, in
<div><img src="..."> caption text</div>, none of the div's strings + # are included in the body text, because it's considered to be a + #