diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 26 | ||||
-rw-r--r-- | python/fatcat_web/entity_helpers.py | 6 | ||||
-rw-r--r-- | python/fatcat_web/templates/entity_base.html | 3 | ||||
-rw-r--r-- | python/tests/harvest_crossref.py | 2 |
4 files changed, 33 insertions, 4 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index d2d71d3c..3acb7d96 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -16,12 +16,32 @@ from .harvest_common import HarvestState, requests_retry_session class HarvestCrossrefWorker: """ - Notes on crossref API: + Crossref API date fields (and our interpretation):: - - from-index-date is the updated time + - https://github.com/CrossRef/rest-api-doc#filter-names + - *-index-date: "metadata indexed" is the API/index record update time + - *-deposit-date: "metadata last (re)deposited" is the catalog record update time + - *-update-date: "Metadata updated (Currently the same as *-deposit-date)" + - *-created-date: "metadata first deposited" + - *-pub-date (etc): publisher-supplied, not "meta-meta-data" https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2 + Also from the REST API: + + Notes on incremental metadata updates + + When using time filters to retrieve periodic, incremental metadata + updates, the from-index-date filter should be used over + from-update-date, from-deposit-date, from-created-date and + from-pub-date. The timestamp that from-index-date filters on is + guaranteed to be updated every time there is a change to metadata + requiring a reindex. + + However, when Crossref re-indexes tens of millions of rows, using + from-index-date can be very slow, taking several days to process a single + day of updates. + I think the design is going to have to be a cronjob or long-running job (with long sleeps) which publishes "success through" to a separate state queue, as simple YYYY-MM-DD strings. 
@@ -87,7 +107,7 @@ class HarvestCrossrefWorker: return Producer(producer_conf) def params(self, date_str): - filter_param = 'from-index-date:{},until-index-date:{}'.format( + filter_param = 'from-update-date:{},until-update-date:{}'.format( date_str, date_str) return { 'filter': filter_param, diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py index 4d13da43..d82ea0e9 100644 --- a/python/fatcat_web/entity_helpers.py +++ b/python/fatcat_web/entity_helpers.py @@ -72,6 +72,12 @@ def enrich_release_entity(entity): # November 1. if ref.extra and ref.extra.get('unstructured'): ref.extra['unstructured'] = strip_extlink_xml(ref.extra['unstructured']) + # for backwards compatibility, copy extra['subtitle'] to subtitle + if not entity.subtitle and entity.extra and entity.extra.get('subtitle'): + if isinstance(entity.extra['subtitle'], str): + entity.subtitle = entity.extra['subtitle'] + elif isinstance(entity.extra['subtitle'], list): + entity.subtitle = entity.extra['subtitle'][0] or None # author list to display; ensure it's sorted by index (any othors with # index=None go to end of list) authors = [c for c in entity.contribs if diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html index 437bc071..f30df0da 100644 --- a/python/fatcat_web/templates/entity_base.html +++ b/python/fatcat_web/templates/entity_base.html @@ -26,6 +26,9 @@ <h1 class="ui header"> {% if entity_type == "container" %} {{ entity.name }} + {% if entity.extra.original_name %} + <br><span style="font-size: smaller; font-weight: normal;">{{ entity.extra.original_name }}</span> + {% endif %} {% elif entity_type == "creator" %} {{ entity.display_name }} {% elif entity_type == "file" %} diff --git a/python/tests/harvest_crossref.py b/python/tests/harvest_crossref.py index 52aa7b81..e902cda5 100644 --- a/python/tests/harvest_crossref.py +++ b/python/tests/harvest_crossref.py @@ -36,7 +36,7 @@ def
test_crossref_harvest_date(mocker): assert "mailto:test@fatcat.wiki" in responses.calls[0].request.headers['User-Agent'] # check that correct date param was passed as expected - assert "filter=from-index-date%3A2019-02-03" in responses.calls[0].request.url + assert "filter=from-update-date%3A2019-02-03" in responses.calls[0].request.url # check that we published the expected number of DOI objects were published # to the (mock) kafka topic |