From 1c87cf9b7dfee65b6ea22e5336a0a1de168140dd Mon Sep 17 00:00:00 2001 From: bnewbold Date: Thu, 5 May 2016 17:02:48 -0400 Subject: add sitemap and headerid plugins --- plugins/sitemap/Readme.rst | 74 ++++++++++++ plugins/sitemap/__init__.py | 1 + plugins/sitemap/sitemap.py | 268 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 343 insertions(+) create mode 100644 plugins/sitemap/Readme.rst create mode 100644 plugins/sitemap/__init__.py create mode 100644 plugins/sitemap/sitemap.py (limited to 'plugins/sitemap') diff --git a/plugins/sitemap/Readme.rst b/plugins/sitemap/Readme.rst new file mode 100644 index 0000000..719c38b --- /dev/null +++ b/plugins/sitemap/Readme.rst @@ -0,0 +1,74 @@ +Sitemap +------- + +This plugin generates plain-text or XML sitemaps. You can use the ``SITEMAP`` +variable in your settings file to configure the behavior of the plugin. + +The ``SITEMAP`` variable must be a Python dictionary and can contain these keys: + +- ``format``, which sets the output format of the plugin (``xml`` or ``txt``) + +- ``priorities``, which is a dictionary with three keys: + + - ``articles``, the priority for the URLs of the articles and their + translations + + - ``pages``, the priority for the URLs of the static pages + + - ``indexes``, the priority for the URLs of the index pages, such as tags, + author pages, categories indexes, archives, etc... + + All the values of this dictionary must be decimal numbers between ``0`` and ``1``. + +- ``changefreqs``, which is a dictionary with three items: + + - ``articles``, the update frequency of the articles + + - ``pages``, the update frequency of the pages + + - ``indexes``, the update frequency of the index pages + + Valid frequency values are ``always``, ``hourly``, ``daily``, ``weekly``, ``monthly``, + ``yearly`` and ``never``. + +You can exclude URLs from being included in the sitemap via regular expressions. +For example, to exclude all URLs containing ``tag/`` or ``category/`` you can +use the following ``SITEMAP`` setting. + +.. code-block:: python + + SITEMAP = { + 'exclude': ['tag/', 'category/'] + } + +If a key is missing or a value is incorrect, it will be replaced with the +default value. + +The sitemap is saved in ``/sitemap.``. + +.. note:: + ``priorities`` and ``changefreqs`` are information for search engines. + They are only used in the XML sitemaps. + For more information: + +**Example** + +Here is an example configuration (it's also the default settings): + +.. code-block:: python + + PLUGINS=['pelican.plugins.sitemap',] + + SITEMAP = { + 'format': 'xml', + 'priorities': { + 'articles': 0.5, + 'indexes': 0.5, + 'pages': 0.5 + }, + 'changefreqs': { + 'articles': 'monthly', + 'indexes': 'daily', + 'pages': 'monthly' + } + } diff --git a/plugins/sitemap/__init__.py b/plugins/sitemap/__init__.py new file mode 100644 index 0000000..6523d3a --- /dev/null +++ b/plugins/sitemap/__init__.py @@ -0,0 +1 @@ +from .sitemap import * \ No newline at end of file diff --git a/plugins/sitemap/sitemap.py b/plugins/sitemap/sitemap.py new file mode 100644 index 0000000..8ce492a --- /dev/null +++ b/plugins/sitemap/sitemap.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- +''' +Sitemap +------- + +The sitemap plugin generates plain-text or XML sitemaps. +''' + +from __future__ import unicode_literals + +import re +import collections +import os.path + +from datetime import datetime +from logging import warning, info +from codecs import open +from pytz import timezone + +from pelican import signals, contents +from pelican.utils import get_date + +TXT_HEADER = """{0}/index.html +{0}/archives.html +{0}/tags.html +{0}/categories.html +""" + +XML_HEADER = """ + +""" + +XML_URL = """ + +{0}/{1} +{2} +{3} +{4} + +""" + +XML_FOOTER = """ + +""" + + +def format_date(date): + if date.tzinfo: + tz = date.strftime('%z') + tz = tz[:-2] + ':' + tz[-2:] + else: + tz = "-00:00" + return date.strftime("%Y-%m-%dT%H:%M:%S") + tz + +class SitemapGenerator(object): + + def __init__(self, context, settings, path, theme, output_path, *null): + + self.output_path = output_path + self.context = context + self.now = datetime.now() + self.siteurl = settings.get('SITEURL') + + + self.default_timezone = settings.get('TIMEZONE', 'UTC') + self.timezone = getattr(self, 'timezone', self.default_timezone) + self.timezone = timezone(self.timezone) + + self.format = 'xml' + + self.changefreqs = { + 'articles': 'monthly', + 'indexes': 'daily', + 'pages': 'monthly' + } + + self.priorities = { + 'articles': 0.5, + 'indexes': 0.5, + 'pages': 0.5 + } + + self.sitemapExclude = [] + + config = settings.get('SITEMAP', {}) + + if not isinstance(config, dict): + warning("sitemap plugin: the SITEMAP setting must be a dict") + else: + fmt = config.get('format') + pris = config.get('priorities') + chfreqs = config.get('changefreqs') + self.sitemapExclude = config.get('exclude', []) + + if fmt not in ('xml', 'txt'): + warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'") + warning("sitemap plugin: Setting SITEMAP['format'] on `xml'") + elif fmt == 'txt': + self.format = fmt + return + + valid_keys = ('articles', 'indexes', 'pages') + valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly', + 'yearly', 'never') + + if isinstance(pris, dict): + # We use items for Py3k compat. .iteritems() otherwise + for k, v in pris.items(): + if k in valid_keys and not isinstance(v, (int, float)): + default = self.priorities[k] + warning("sitemap plugin: priorities must be numbers") + warning("sitemap plugin: setting SITEMAP['priorities']" + "['{0}'] on {1}".format(k, default)) + pris[k] = default + self.priorities.update(pris) + elif pris is not None: + warning("sitemap plugin: SITEMAP['priorities'] must be a dict") + warning("sitemap plugin: using the default values") + + if isinstance(chfreqs, dict): + # .items() for py3k compat. + for k, v in chfreqs.items(): + if k in valid_keys and v not in valid_chfreqs: + default = self.changefreqs[k] + warning("sitemap plugin: invalid changefreq `{0}'".format(v)) + warning("sitemap plugin: setting SITEMAP['changefreqs']" + "['{0}'] on '{1}'".format(k, default)) + chfreqs[k] = default + self.changefreqs.update(chfreqs) + elif chfreqs is not None: + warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict") + warning("sitemap plugin: using the default values") + + def write_url(self, page, fd): + + if getattr(page, 'status', 'published') != 'published': + return + + # We can disable categories/authors/etc by using False instead of '' + if not page.save_as: + return + + page_path = os.path.join(self.output_path, page.save_as) + if not os.path.exists(page_path): + return + + lastdate = getattr(page, 'date', self.now) + try: + lastdate = self.get_date_modified(page, lastdate) + except ValueError: + warning("sitemap plugin: " + page.save_as + " has invalid modification date,") + warning("sitemap plugin: using date value as lastmod.") + lastmod = format_date(lastdate) + + if isinstance(page, contents.Article): + pri = self.priorities['articles'] + chfreq = self.changefreqs['articles'] + elif isinstance(page, contents.Page): + pri = self.priorities['pages'] + chfreq = self.changefreqs['pages'] + else: + pri = self.priorities['indexes'] + chfreq = self.changefreqs['indexes'] + + pageurl = '' if page.url == 'index.html' else page.url + + #Exclude URLs from the sitemap: + if self.format == 'xml': + flag = False + for regstr in self.sitemapExclude: + if re.match(regstr, pageurl): + flag = True + break + if not flag: + fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri)) + else: + fd.write(self.siteurl + '/' + pageurl + '\n') + + def get_date_modified(self, page, default): + if hasattr(page, 'modified'): + if isinstance(page.modified, datetime): + return page.modified + return get_date(page.modified) + else: + return default + + def set_url_wrappers_modification_date(self, wrappers): + for (wrapper, articles) in wrappers: + lastmod = datetime.min.replace(tzinfo=self.timezone) + for article in articles: + lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone)) + try: + modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone) + lastmod = max(lastmod, modified) + except ValueError: + # Supressed: user will be notified. + pass + setattr(wrapper, 'modified', str(lastmod)) + + def generate_output(self, writer): + path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format)) + + pages = self.context['pages'] + self.context['articles'] \ + + [ c for (c, a) in self.context['categories']] \ + + [ t for (t, a) in self.context['tags']] \ + + [ a for (a, b) in self.context['authors']] + + self.set_url_wrappers_modification_date(self.context['categories']) + self.set_url_wrappers_modification_date(self.context['tags']) + self.set_url_wrappers_modification_date(self.context['authors']) + + for article in self.context['articles']: + pages += article.translations + + info('writing {0}'.format(path)) + + with open(path, 'w', encoding='utf-8') as fd: + + if self.format == 'xml': + fd.write(XML_HEADER) + else: + fd.write(TXT_HEADER.format(self.siteurl)) + + FakePage = collections.namedtuple('FakePage', + ['status', + 'date', + 'url', + 'save_as']) + + for standard_page_url in ['index.html', + 'archives.html', + 'tags.html', + 'categories.html']: + fake = FakePage(status='published', + date=self.now, + url=standard_page_url, + save_as=standard_page_url) + self.write_url(fake, fd) + + # add template pages + # We use items for Py3k compat. .iteritems() otherwise + for path, template_page_url in self.context['TEMPLATE_PAGES'].items(): + + # don't add duplicate entry for index page + if template_page_url == 'index.html': + continue + + fake = FakePage(status='published', + date=self.now, + url=template_page_url, + save_as=template_page_url) + self.write_url(fake, fd) + + for page in pages: + self.write_url(page, fd) + + if self.format == 'xml': + fd.write(XML_FOOTER) + + +def get_generators(generators): + return SitemapGenerator + + +def register(): + signals.get_generators.connect(get_generators) -- cgit v1.2.3