aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/harvest/doi_registrars.py | 26
-rw-r--r-- python/fatcat_web/entity_helpers.py | 6
-rw-r--r-- python/fatcat_web/templates/entity_base.html | 3
-rw-r--r-- python/tests/harvest_crossref.py | 2
4 files changed, 33 insertions, 4 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index d2d71d3c..3acb7d96 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -16,12 +16,32 @@ from .harvest_common import HarvestState, requests_retry_session
class HarvestCrossrefWorker:
"""
- Notes on crossref API:
+ Crossref API date fields (and our interpretation)::
- - from-index-date is the updated time
+ - https://github.com/CrossRef/rest-api-doc#filter-names
+ - *-index-date: "metadata indexed" is the API/index record update time
+ - *-deposit-date: "metadata last (re)deposited" is the catalog record update time
+ - *-update-date: "Metadata updated (Currently the same as *-deposit-date)"
+ - *-created-date: "metadata first deposited"
+ - *-pub-date (etc): publisher-supplied, not "meta-meta-data"
https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2
+ Also from the REST API:
+
+ Notes on incremental metadata updates
+
+ When using time filters to retrieve periodic, incremental metadata
+ updates, the from-index-date filter should be used over
+ from-update-date, from-deposit-date, from-created-date and
+ from-pub-date. The timestamp that from-index-date filters on is
+ guaranteed to be updated every time there is a change to metadata
+ requiring a reindex.
+
+ However, when Crossref re-indexes tens of millions of rows, using
+ from-index-date can be very slow, taking several days to process a single
+ day of updates.
+
I think the design is going to have to be a cronjob or long-running job
(with long sleeps) which publishes "success through" to a separate state
queue, as simple YYYY-MM-DD strings.
@@ -87,7 +107,7 @@ class HarvestCrossrefWorker:
return Producer(producer_conf)
def params(self, date_str):
- filter_param = 'from-index-date:{},until-index-date:{}'.format(
+ filter_param = 'from-update-date:{},until-update-date:{}'.format(
date_str, date_str)
return {
'filter': filter_param,
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index 4d13da43..d82ea0e9 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -72,6 +72,12 @@ def enrich_release_entity(entity):
# November 1.
if ref.extra and ref.extra.get('unstructured'):
ref.extra['unstructured'] = strip_extlink_xml(ref.extra['unstructured'])
+ # for backwards compatibility, copy extra['subtitle'] to subtitle
+ if not entity.subtitle and entity.extra and entity.extra.get('subtitle'):
+ if isinstance(entity.extra['subtitle'], str):
+ entity.subtitle = entity.extra['subtitle']
+ elif isinstance(entity.extra['subtitle'], list):
+ entity.subtitle = entity.extra['subtitle'][0] or None
# author list to display; ensure it's sorted by index (any othors with
# index=None go to end of list)
authors = [c for c in entity.contribs if
diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html
index 437bc071..f30df0da 100644
--- a/python/fatcat_web/templates/entity_base.html
+++ b/python/fatcat_web/templates/entity_base.html
@@ -26,6 +26,9 @@
<h1 class="ui header">
{% if entity_type == "container" %}
{{ entity.name }}
+ {% if entity.extra.original_name %}
+ <br><span style="font-size: smaller; font-weight: normal;">{{ entity.extra.original_name }}</span>
+ {% endif %}
{% elif entity_type == "creator" %}
{{ entity.display_name }}
{% elif entity_type == "file" %}
diff --git a/python/tests/harvest_crossref.py b/python/tests/harvest_crossref.py
index 52aa7b81..e902cda5 100644
--- a/python/tests/harvest_crossref.py
+++ b/python/tests/harvest_crossref.py
@@ -36,7 +36,7 @@ def test_crossref_harvest_date(mocker):
assert "mailto:test@fatcat.wiki" in responses.calls[0].request.headers['User-Agent']
# check that correct date param was passed as expected
- assert "filter=from-index-date%3A2019-02-03" in responses.calls[0].request.url
+ assert "filter=from-update-date%3A2019-02-03" in responses.calls[0].request.url
# check that we published the expected number of DOI objects were published
# to the (mock) kafka topic