diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2017-08-30 21:04:39 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2017-08-30 21:10:40 -0700 | 
| commit | 381e40f2b9c87b6a9359f2c79cf4687064625288 (patch) | |
| tree | 5542cd9ae24ac604958f09686c9b53923452d459 /plugins/sitemap | |
| download | archive3k.org-381e40f2b9c87b6a9359f2c79cf4687064625288.tar.gz archive3k.org-381e40f2b9c87b6a9359f2c79cf4687064625288.zip | |
bootstrap 3 + pelican template repo
Diffstat (limited to 'plugins/sitemap')
| -rw-r--r-- | plugins/sitemap/Readme.rst | 74 | ||||
| -rw-r--r-- | plugins/sitemap/__init__.py | 1 | ||||
| -rw-r--r-- | plugins/sitemap/sitemap.py | 268 | 
3 files changed, 343 insertions, 0 deletions
| diff --git a/plugins/sitemap/Readme.rst b/plugins/sitemap/Readme.rst new file mode 100644 index 0000000..719c38b --- /dev/null +++ b/plugins/sitemap/Readme.rst @@ -0,0 +1,74 @@ +Sitemap +------- + +This plugin generates plain-text or XML sitemaps. You can use the ``SITEMAP`` +variable in your settings file to configure the behavior of the plugin. + +The ``SITEMAP`` variable must be a Python dictionary and can contain these keys: + +- ``format``, which sets the output format of the plugin (``xml`` or ``txt``) + +- ``priorities``, which is a dictionary with three keys: + +  - ``articles``, the priority for the URLs of the articles and their +    translations + +  - ``pages``, the priority for the URLs of the static pages + +  - ``indexes``, the priority for the URLs of the index pages, such as tags, +     author pages, categories indexes, archives, etc... + +  All the values of this dictionary must be decimal numbers between ``0`` and ``1``. + +- ``changefreqs``, which is a dictionary with three items: + +  - ``articles``, the update frequency of the articles + +  - ``pages``, the update frequency of the pages + +  - ``indexes``, the update frequency of the index pages + +  Valid frequency values are ``always``, ``hourly``, ``daily``, ``weekly``, ``monthly``, +  ``yearly`` and ``never``. + +You can exclude URLs from being included in the sitemap via regular expressions. +For example, to exclude all URLs containing ``tag/`` or ``category/`` you can +use the following ``SITEMAP`` setting. + +.. code-block:: python + +    SITEMAP = { +        'exclude': ['tag/', 'category/'] +    } + +If a key is missing or a value is incorrect, it will be replaced with the +default value. + +The sitemap is saved in ``<output_path>/sitemap.<format>``. + +.. note:: +   ``priorities`` and ``changefreqs`` are information for search engines. +   They are only used in the XML sitemaps. +   For more information: <http://www.sitemaps.org/protocol.html#xmlTagDefinitions> + +**Example** + +Here is an example configuration (it's also the default settings): + +.. code-block:: python + +    PLUGINS=['pelican.plugins.sitemap',] + +    SITEMAP = { +        'format': 'xml', +        'priorities': { +            'articles': 0.5, +            'indexes': 0.5, +            'pages': 0.5 +        }, +        'changefreqs': { +            'articles': 'monthly', +            'indexes': 'daily', +            'pages': 'monthly' +        } +    } diff --git a/plugins/sitemap/__init__.py b/plugins/sitemap/__init__.py new file mode 100644 index 0000000..6523d3a --- /dev/null +++ b/plugins/sitemap/__init__.py @@ -0,0 +1 @@ +from .sitemap import *
\ No newline at end of file diff --git a/plugins/sitemap/sitemap.py b/plugins/sitemap/sitemap.py new file mode 100644 index 0000000..8ce492a --- /dev/null +++ b/plugins/sitemap/sitemap.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- +''' +Sitemap +------- + +The sitemap plugin generates plain-text or XML sitemaps. +''' + +from __future__ import unicode_literals + +import re +import collections +import os.path + +from datetime import datetime +from logging import warning, info +from codecs import open +from pytz import timezone + +from pelican import signals, contents +from pelican.utils import get_date + +TXT_HEADER = """{0}/index.html +{0}/archives.html +{0}/tags.html +{0}/categories.html +""" + +XML_HEADER = """<?xml version="1.0" encoding="utf-8"?> +<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" +xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" +xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> +""" + +XML_URL = """ +<url> +<loc>{0}/{1}</loc> +<lastmod>{2}</lastmod> +<changefreq>{3}</changefreq> +<priority>{4}</priority> +</url> +""" + +XML_FOOTER = """ +</urlset> +""" + + +def format_date(date): +    if date.tzinfo: +        tz = date.strftime('%z') +        tz = tz[:-2] + ':' + tz[-2:] +    else: +        tz = "-00:00" +    return date.strftime("%Y-%m-%dT%H:%M:%S") + tz + +class SitemapGenerator(object): + +    def __init__(self, context, settings, path, theme, output_path, *null): + +        self.output_path = output_path +        self.context = context +        self.now = datetime.now() +        self.siteurl = settings.get('SITEURL') + + +        self.default_timezone = settings.get('TIMEZONE', 'UTC') +        self.timezone = getattr(self, 'timezone', self.default_timezone) +        self.timezone = timezone(self.timezone) + +        self.format = 'xml' + +        self.changefreqs = { +            'articles': 'monthly', +            'indexes': 'daily', +            'pages': 'monthly' +        } + +        self.priorities = { +            'articles': 0.5, +            'indexes': 0.5, +            'pages': 0.5 +        } + +        self.sitemapExclude = [] + +        config = settings.get('SITEMAP', {}) + +        if not isinstance(config, dict): +            warning("sitemap plugin: the SITEMAP setting must be a dict") +        else: +            fmt = config.get('format') +            pris = config.get('priorities') +            chfreqs = config.get('changefreqs') +            self.sitemapExclude = config.get('exclude', []) + +            if fmt not in ('xml', 'txt'): +                warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'") +                warning("sitemap plugin: Setting SITEMAP['format'] on `xml'") +            elif fmt == 'txt': +                self.format = fmt +                return + +            valid_keys = ('articles', 'indexes', 'pages') +            valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly', +                    'yearly', 'never') + +            if isinstance(pris, dict): +                # We use items for Py3k compat. .iteritems() otherwise +                for k, v in pris.items(): +                    if k in valid_keys and not isinstance(v, (int, float)): +                        default = self.priorities[k] +                        warning("sitemap plugin: priorities must be numbers") +                        warning("sitemap plugin: setting SITEMAP['priorities']" +                                "['{0}'] on {1}".format(k, default)) +                        pris[k] = default +                self.priorities.update(pris) +            elif pris is not None: +                warning("sitemap plugin: SITEMAP['priorities'] must be a dict") +                warning("sitemap plugin: using the default values") + +            if isinstance(chfreqs, dict): +                # .items() for py3k compat. +                for k, v in chfreqs.items(): +                    if k in valid_keys and v not in valid_chfreqs: +                        default = self.changefreqs[k] +                        warning("sitemap plugin: invalid changefreq `{0}'".format(v)) +                        warning("sitemap plugin: setting SITEMAP['changefreqs']" +                                "['{0}'] on '{1}'".format(k, default)) +                        chfreqs[k] = default +                self.changefreqs.update(chfreqs) +            elif chfreqs is not None: +                warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict") +                warning("sitemap plugin: using the default values") + +    def write_url(self, page, fd): + +        if getattr(page, 'status', 'published') != 'published': +            return + +        # We can disable categories/authors/etc by using False instead of '' +        if not page.save_as: +            return + +        page_path = os.path.join(self.output_path, page.save_as) +        if not os.path.exists(page_path): +            return + +        lastdate = getattr(page, 'date', self.now) +        try: +            lastdate = self.get_date_modified(page, lastdate) +        except ValueError: +            warning("sitemap plugin: " + page.save_as + " has invalid modification date,") +            warning("sitemap plugin: using date value as lastmod.") +        lastmod = format_date(lastdate) + +        if isinstance(page, contents.Article): +            pri = self.priorities['articles'] +            chfreq = self.changefreqs['articles'] +        elif isinstance(page, contents.Page): +            pri = self.priorities['pages'] +            chfreq = self.changefreqs['pages'] +        else: +            pri = self.priorities['indexes'] +            chfreq = self.changefreqs['indexes'] + +        pageurl = '' if page.url == 'index.html' else page.url + +        #Exclude URLs from the sitemap: +        if self.format == 'xml': +            flag = False +            for regstr in self.sitemapExclude: +                if re.match(regstr, pageurl): +                    flag = True +                    break +            if not flag: +                fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri)) +        else: +            fd.write(self.siteurl + '/' + pageurl + '\n') + +    def get_date_modified(self, page, default): +        if hasattr(page, 'modified'): +            if isinstance(page.modified, datetime): +                return page.modified +            return get_date(page.modified) +        else: +            return default + +    def set_url_wrappers_modification_date(self, wrappers): +        for (wrapper, articles) in wrappers: +            lastmod = datetime.min.replace(tzinfo=self.timezone) +            for article in articles: +                lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone)) +                try: +                    modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone) +                    lastmod = max(lastmod, modified) +                except ValueError: +                    # Supressed: user will be notified. +                    pass +            setattr(wrapper, 'modified', str(lastmod)) + +    def generate_output(self, writer): +        path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format)) + +        pages = self.context['pages'] + self.context['articles'] \ +                + [ c for (c, a) in self.context['categories']] \ +                + [ t for (t, a) in self.context['tags']] \ +                + [ a for (a, b) in self.context['authors']] + +        self.set_url_wrappers_modification_date(self.context['categories']) +        self.set_url_wrappers_modification_date(self.context['tags']) +        self.set_url_wrappers_modification_date(self.context['authors']) + +        for article in self.context['articles']: +            pages += article.translations + +        info('writing {0}'.format(path)) + +        with open(path, 'w', encoding='utf-8') as fd: + +            if self.format == 'xml': +                fd.write(XML_HEADER) +            else: +                fd.write(TXT_HEADER.format(self.siteurl)) + +            FakePage = collections.namedtuple('FakePage', +                                              ['status', +                                               'date', +                                               'url', +                                               'save_as']) + +            for standard_page_url in ['index.html', +                                      'archives.html', +                                      'tags.html', +                                      'categories.html']: +                fake = FakePage(status='published', +                                date=self.now, +                                url=standard_page_url, +                                save_as=standard_page_url) +                self.write_url(fake, fd) + +            # add template pages +            # We use items for Py3k compat. .iteritems() otherwise +            for path, template_page_url in self.context['TEMPLATE_PAGES'].items(): + +                # don't add duplicate entry for index page +                if template_page_url == 'index.html': +                    continue + +                fake = FakePage(status='published', +                                date=self.now, +                                url=template_page_url, +                                save_as=template_page_url) +                self.write_url(fake, fd) + +            for page in pages: +                self.write_url(page, fd) + +            if self.format == 'xml': +                fd.write(XML_FOOTER) + + +def get_generators(generators): +    return SitemapGenerator + + +def register(): +    signals.get_generators.connect(get_generators) | 
