add sitemap and headerid plugins

author: bnewbold <bnewbold@robocracy.org> 2016-05-05 17:02:48 -0400
committer: bnewbold <bnewbold@robocracy.org> 2016-05-05 17:02:48 -0400
commit: 1c87cf9b7dfee65b6ea22e5336a0a1de168140dd (patch)
tree: 5df4c522251bccc27eef3c898bdf5e59219bbf25 /plugins/sitemap
parent: 19aba9065649a9b41fe82783a48cb056304bd847 (diff)
download: bnewnet-1c87cf9b7dfee65b6ea22e5336a0a1de168140dd.tar.gz
bnewnet-1c87cf9b7dfee65b6ea22e5336a0a1de168140dd.zip
3 files changed, 343 insertions, 0 deletions
diff --git a/plugins/sitemap/Readme.rst b/plugins/sitemap/Readme.rst
new file mode 100644
index 0000000..719c38b
--- /dev/null
+++ b/plugins/sitemap/Readme.rst
@@ -0,0 +1,74 @@
+Sitemap
+-------
+
+This plugin generates plain-text or XML sitemaps. You can use the ``SITEMAP``
+variable in your settings file to configure the behavior of the plugin.
+
+The ``SITEMAP`` variable must be a Python dictionary and can contain these keys:
+
+- ``format``, which sets the output format of the plugin (``xml`` or ``txt``)
+
+- ``priorities``, which is a dictionary with three keys:
+
+  - ``articles``, the priority for the URLs of the articles and their
+    translations
+
+  - ``pages``, the priority for the URLs of the static pages
+
+  - ``indexes``, the priority for the URLs of the index pages, such as tags,
+    author pages, category indexes, archives, etc.
+
+  All the values of this dictionary must be decimal numbers between ``0`` and ``1``.
+
+- ``changefreqs``, which is a dictionary with three keys:
+
+  - ``articles``, the update frequency of the articles
+
+  - ``pages``, the update frequency of the pages
+
+  - ``indexes``, the update frequency of the index pages
+
+  Valid frequency values are ``always``, ``hourly``, ``daily``, ``weekly``, ``monthly``,
+  ``yearly`` and ``never``.
+
+You can exclude URLs from being included in the sitemap via regular expressions.
+For example, to exclude all URLs containing ``tag/`` or ``category/`` you can
+use the following ``SITEMAP`` setting.
+
+.. code-block:: python
+
+    SITEMAP = {
+        'exclude': ['tag/', 'category/']
+    }
+
+If a key is missing or a value is incorrect, it will be replaced with the
+default value.
+
+The sitemap is saved in ``<output_path>/sitemap.<format>``.
+
+.. note::
+   ``priorities`` and ``changefreqs`` are information for search engines.
+   They are only used in the XML sitemaps.
+   For more information: <http://www.sitemaps.org/protocol.html#xmlTagDefinitions>
+
+**Example**
+
+Here is an example configuration (these are also the default settings):
+
+.. code-block:: python
+
+    PLUGINS=['pelican.plugins.sitemap',]
+
+    SITEMAP = {
+        'format': 'xml',
+        'priorities': {
+            'articles': 0.5,
+            'indexes': 0.5,
+            'pages': 0.5
+        },
+        'changefreqs': {
+            'articles': 'monthly',
+            'indexes': 'daily',
+            'pages': 'monthly'
+        }
+    }
diff --git a/plugins/sitemap/__init__.py b/plugins/sitemap/__init__.py
new file mode 100644
index 0000000..6523d3a
--- /dev/null
+++ b/plugins/sitemap/__init__.py
@@ -0,0 +1 @@
+from .sitemap import *
+\ No newline at end of file
diff --git a/plugins/sitemap/sitemap.py b/plugins/sitemap/sitemap.py
new file mode 100644
index 0000000..8ce492a
--- /dev/null
+++ b/plugins/sitemap/sitemap.py
@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+'''
+Sitemap
+-------
+
+The sitemap plugin generates plain-text or XML sitemaps.
+'''
+
+from __future__ import unicode_literals
+
+import re
+import collections
+import os.path
+
+from datetime import datetime
+from logging import warning, info
+from codecs import open
+from pytz import timezone
+
+from pelican import signals, contents
+from pelican.utils import get_date
+
+# Preamble of the plain-text sitemap: the four standard index pages.
+# ``{0}`` is filled with the site URL.
+TXT_HEADER = """{0}/index.html
+{0}/archives.html
+{0}/tags.html
+{0}/categories.html
+"""
+
+# Opening of the XML sitemap: XML declaration plus the ``<urlset>`` root tag.
+XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+"""
+
+# One ``<url>`` entry of the XML sitemap. Format fields:
+# {0} site URL, {1} page URL, {2} lastmod, {3} changefreq, {4} priority.
+XML_URL = """
+<url>
+<loc>{0}/{1}</loc>
+<lastmod>{2}</lastmod>
+<changefreq>{3}</changefreq>
+<priority>{4}</priority>
+</url>
+"""
+
+# Closes the ``<urlset>`` root element opened by XML_HEADER.
+XML_FOOTER = """
+</urlset>
+"""
+
+
+def format_date(date):
+    '''Render ``date`` as ``YYYY-MM-DDTHH:MM:SS`` followed by a UTC offset.
+
+    Timezone-aware values get their own ``%z`` offset with a colon inserted
+    before the minutes (``+hh:mm``); naive values get ``-00:00``.
+    '''
+    stamp = date.strftime("%Y-%m-%dT%H:%M:%S")
+    if not date.tzinfo:
+        return stamp + "-00:00"
+    offset = date.strftime('%z')
+    return stamp + offset[:-2] + ':' + offset[-2:]
+
+class SitemapGenerator(object):
+    '''Pelican generator that writes ``sitemap.xml`` or ``sitemap.txt``.
+
+    Behaviour is driven by the optional ``SITEMAP`` settings dict (keys
+    ``format``, ``priorities``, ``changefreqs`` and ``exclude``).  Missing or
+    invalid entries fall back to the defaults set in ``__init__``.
+    '''
+
+    def __init__(self, context, settings, path, theme, output_path, *null):
+        '''Read the ``SITEMAP`` setting and store the validated configuration.
+
+        ``path``, ``theme`` and any extra positional arguments are accepted
+        but not used.
+        '''
+        self.output_path = output_path
+        self.context = context
+        self.now = datetime.now()
+        self.siteurl = settings.get('SITEURL')
+
+        self.default_timezone = settings.get('TIMEZONE', 'UTC')
+        # A ``timezone`` attribute that already exists (e.g. defined on a
+        # subclass) takes precedence over the TIMEZONE setting.
+        self.timezone = getattr(self, 'timezone', self.default_timezone)
+        self.timezone = timezone(self.timezone)
+
+        self.format = 'xml'
+
+        self.changefreqs = {
+            'articles': 'monthly',
+            'indexes': 'daily',
+            'pages': 'monthly'
+        }
+
+        self.priorities = {
+            'articles': 0.5,
+            'indexes': 0.5,
+            'pages': 0.5
+        }
+
+        self.sitemapExclude = []
+
+        config = settings.get('SITEMAP', {})
+
+        if not isinstance(config, dict):
+            warning("sitemap plugin: the SITEMAP setting must be a dict")
+            return
+
+        fmt = config.get('format')
+        pris = config.get('priorities')
+        chfreqs = config.get('changefreqs')
+        self.sitemapExclude = config.get('exclude', [])
+
+        if fmt == 'txt':
+            # priorities and changefreqs are only emitted in XML output, so
+            # there is nothing left to validate.
+            self.format = fmt
+            return
+        if fmt not in (None, 'xml'):
+            # A missing key silently keeps the default; only a wrong value
+            # is worth a warning.
+            warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
+            warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
+
+        valid_keys = ('articles', 'indexes', 'pages')
+        valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
+                'yearly', 'never')
+
+        if isinstance(pris, dict):
+            # Work on a copy so corrections never modify the user's settings.
+            pris = dict(pris)
+            for k, v in list(pris.items()):
+                if k not in valid_keys:
+                    continue
+                is_number = isinstance(v, (int, float))
+                if not is_number or not 0 <= v <= 1:
+                    default = self.priorities[k]
+                    warning("sitemap plugin: priorities must be numbers "
+                            "between 0 and 1")
+                    warning("sitemap plugin: setting SITEMAP['priorities']"
+                            "['{0}'] on {1}".format(k, default))
+                    pris[k] = default
+            self.priorities.update(pris)
+        elif pris is not None:
+            warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
+            warning("sitemap plugin: using the default values")
+
+        if isinstance(chfreqs, dict):
+            # Same copy-then-correct approach as for the priorities.
+            chfreqs = dict(chfreqs)
+            for k, v in list(chfreqs.items()):
+                if k in valid_keys and v not in valid_chfreqs:
+                    default = self.changefreqs[k]
+                    warning("sitemap plugin: invalid changefreq `{0}'".format(v))
+                    warning("sitemap plugin: setting SITEMAP['changefreqs']"
+                            "['{0}'] on '{1}'".format(k, default))
+                    chfreqs[k] = default
+            self.changefreqs.update(chfreqs)
+        elif chfreqs is not None:
+            warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
+            warning("sitemap plugin: using the default values")
+
+    def write_url(self, page, fd):
+        '''Write the sitemap entry for ``page`` to the open file ``fd``.
+
+        Nothing is written when the page is not published, has a falsy
+        ``save_as``, has no generated file under the output path, or has a
+        URL matching one of the ``exclude`` regular expressions.
+        '''
+        if getattr(page, 'status', 'published') != 'published':
+            return
+
+        # We can disable categories/authors/etc by using False instead of ''
+        if not page.save_as:
+            return
+
+        page_path = os.path.join(self.output_path, page.save_as)
+        if not os.path.exists(page_path):
+            return
+
+        lastdate = getattr(page, 'date', self.now)
+        try:
+            lastdate = self.get_date_modified(page, lastdate)
+        except ValueError:
+            warning("sitemap plugin: " + page.save_as + " has invalid modification date,")
+            warning("sitemap plugin: using date value as lastmod.")
+        lastmod = format_date(lastdate)
+
+        if isinstance(page, contents.Article):
+            pri = self.priorities['articles']
+            chfreq = self.changefreqs['articles']
+        elif isinstance(page, contents.Page):
+            pri = self.priorities['pages']
+            chfreq = self.changefreqs['pages']
+        else:
+            pri = self.priorities['indexes']
+            chfreq = self.changefreqs['indexes']
+
+        pageurl = '' if page.url == 'index.html' else page.url
+
+        # Exclude URLs from the sitemap.  ``re.search`` is used so a pattern
+        # excludes every URL *containing* a match, and the filter applies to
+        # both output formats.
+        if any(re.search(regstr, pageurl) for regstr in self.sitemapExclude):
+            return
+
+        if self.format == 'xml':
+            fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
+        else:
+            fd.write(self.siteurl + '/' + pageurl + '\n')
+
+    def get_date_modified(self, page, default):
+        '''Return the ``modified`` value of ``page`` as a datetime.
+
+        A non-datetime ``modified`` is parsed with ``get_date``; ``default``
+        is returned when the page has no ``modified`` attribute.  Callers in
+        this class handle ``ValueError`` from the parsing step.
+        '''
+        if not hasattr(page, 'modified'):
+            return default
+        if isinstance(page.modified, datetime):
+            return page.modified
+        return get_date(page.modified)
+
+    def set_url_wrappers_modification_date(self, wrappers):
+        '''Set ``modified`` on each wrapper to its newest article timestamp.
+
+        ``wrappers`` is an iterable of ``(wrapper, articles)`` pairs.  The
+        newest of every article's ``date`` and ``modified`` value is stored
+        on the wrapper as a string.
+        '''
+        # NOTE(review): pytz documents that attaching its zones through
+        # ``replace(tzinfo=...)`` can produce unexpected (LMT) offsets and
+        # recommends ``localize()``; confirm whether that matters here.
+        for (wrapper, articles) in wrappers:
+            lastmod = datetime.min.replace(tzinfo=self.timezone)
+            for article in articles:
+                lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone))
+                try:
+                    modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone)
+                    lastmod = max(lastmod, modified)
+                except ValueError:
+                    # Suppressed: user will be notified.
+                    pass
+            setattr(wrapper, 'modified', str(lastmod))
+
+    def generate_output(self, writer):
+        '''Write ``sitemap.<format>`` into the output directory.
+
+        ``writer`` is accepted but not used; the file is written directly.
+        '''
+        path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
+
+        pages = self.context['pages'] + self.context['articles'] \
+                + [ c for (c, a) in self.context['categories']] \
+                + [ t for (t, a) in self.context['tags']] \
+                + [ a for (a, b) in self.context['authors']]
+
+        self.set_url_wrappers_modification_date(self.context['categories'])
+        self.set_url_wrappers_modification_date(self.context['tags'])
+        self.set_url_wrappers_modification_date(self.context['authors'])
+
+        for article in self.context['articles']:
+            pages += article.translations
+
+        info('writing {0}'.format(path))
+
+        with open(path, 'w', encoding='utf-8') as fd:
+
+            if self.format == 'xml':
+                fd.write(XML_HEADER)
+            else:
+                # NOTE(review): TXT_HEADER already lists the four standard
+                # pages that the loop below writes again when they exist, so
+                # txt output can contain duplicates.
+                fd.write(TXT_HEADER.format(self.siteurl))
+
+            FakePage = collections.namedtuple('FakePage',
+                                              ['status',
+                                               'date',
+                                               'url',
+                                               'save_as'])
+
+            for standard_page_url in ['index.html',
+                                      'archives.html',
+                                      'tags.html',
+                                      'categories.html']:
+                fake = FakePage(status='published',
+                                date=self.now,
+                                url=standard_page_url,
+                                save_as=standard_page_url)
+                self.write_url(fake, fd)
+
+            # add template pages; only the target URLs are needed, which
+            # also avoids rebinding ``path`` inside the loop
+            for template_page_url in self.context['TEMPLATE_PAGES'].values():
+
+                # don't add duplicate entry for index page
+                if template_page_url == 'index.html':
+                    continue
+
+                fake = FakePage(status='published',
+                                date=self.now,
+                                url=template_page_url,
+                                save_as=template_page_url)
+                self.write_url(fake, fd)
+
+            for page in pages:
+                self.write_url(page, fd)
+
+            if self.format == 'xml':
+                fd.write(XML_FOOTER)
+
+
+def get_generators(generators):
+    '''Return the ``SitemapGenerator`` class; ``generators`` is not used.'''
+    return SitemapGenerator
+
+
+def register():
+    '''Connect ``get_generators`` to Pelican's ``get_generators`` signal.'''
+    signals.get_generators.connect(get_generators)
author	bnewbold <bnewbold@robocracy.org>	2016-05-05 17:02:48 -0400
committer	bnewbold <bnewbold@robocracy.org>	2016-05-05 17:02:48 -0400
commit	1c87cf9b7dfee65b6ea22e5336a0a1de168140dd (patch)
tree	5df4c522251bccc27eef3c898bdf5e59219bbf25 /plugins/sitemap
parent	19aba9065649a9b41fe82783a48cb056304bd847 (diff)
download	bnewnet-1c87cf9b7dfee65b6ea22e5336a0a1de168140dd.tar.gz bnewnet-1c87cf9b7dfee65b6ea22e5336a0a1de168140dd.zip