Diffstat (limited to 'python')
-rwxr-xr-x  python/fatcat_import.py                           |  15
-rw-r--r--  python/fatcat_tools/importers/datacite.py         | 180
-rw-r--r--  python/tests/files/datacite_1k_records.jsonl.gz   | bin 0 -> 684605 bytes
-rw-r--r--  python/tests/files/datacite_sample.jsonl          |   1
-rw-r--r--  python/tests/import_datacite.py                   | 108
5 files changed, 245 insertions, 59 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index d7651792..90bb01a1 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -170,7 +170,10 @@ def run_datacite(args):
dci = DataciteImporter(args.api,
args.issn_map_file,
edit_batch_size=args.batch_size,
- bezerk_mode=args.bezerk_mode)
+ bezerk_mode=args.bezerk_mode,
+ debug=args.debug,
+ lang_detect=args.lang_detect,
+ insert_log_file=args.insert_log_file)
if args.kafka_mode:
KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite",
"fatcat-import", consume_batch_size=args.batch_size).run()
@@ -464,6 +467,16 @@ def main():
sub_datacite.add_argument('--bezerk-mode',
action='store_true',
help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+ sub_datacite.add_argument('--debug',
+ action='store_true',
+ help="write converted JSON to stdout")
+ sub_datacite.add_argument('--lang-detect',
+ action='store_true',
+ help="try to detect language (slow)")
+ sub_datacite.add_argument('--insert-log-file',
+ default='',
+ type=str,
+ help="write inserted documents into file (for debugging)")
sub_datacite.set_defaults(
func=run_datacite,
auth_var="FATCAT_API_AUTH_TOKEN",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 4e117dde..9774e334 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
from .common import EntityImporter
import dateparser
-import langcodes
import datetime
-import langdetect
import fatcat_openapi_client
+import hashlib
import json
+import langcodes
+import langdetect
+import sqlite3
import sys
-import hashlib
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
CONTAINER_TYPE_MAP = {
@@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = {
class DataciteImporter(EntityImporter):
"""
- Importer for datacite records. TODO(martin): Do we need issn_map_file?
+ Importer for datacite records.
"""
- def __init__(self, api, issn_map_file, **kwargs):
+ def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
+ insert_log_file=None, **kwargs):
eg_desc = kwargs.get('editgroup_description',
"Automated import of Datacite DOI metadata, harvested from REST API")
@@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter):
**kwargs)
self.create_containers = kwargs.get('create_containers', True)
+ extid_map_file = kwargs.get('extid_map_file')
+ self.extid_map_db = None
+ if extid_map_file:
+ db_uri = "file:{}?mode=ro".format(extid_map_file)
+ print("Using external ID map: {}".format(db_uri), file=sys.stderr)
+ self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+ else:
+ print("Not using external ID map", file=sys.stderr)
+
self.read_issn_map_file(issn_map_file)
+ self.debug = debug
+ self.lang_detect = lang_detect
+ self.insert_log_file = insert_log_file
+
+ print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+
+ def lookup_ext_ids(self, doi):
+ """
+ Return dictionary of identifiers referring to the same things as the given DOI.
+ """
+ if self.extid_map_db is None:
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+ [doi.lower()]).fetchone()
+ if row is None:
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=row[0],
+ pmid=row[1],
+ pmcid=row[2],
+ wikidata_qid=row[3],
+ # TODO:
+ arxiv_id=None,
+ jstor_id=None,
+ )
def parse_record(self, obj):
"""
@@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter):
attributes = obj['attributes']
- # Contributors. Many nameIdentifierSchemes, we do not use yet:
- # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
- # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
- # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
+ # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+ # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+ # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+ # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
contribs = []
for i, c in enumerate(attributes['creators']):
- if not c.get('nameType') == 'Personal':
+ if 'nameType' in c and not c.get('nameType') == 'Personal':
continue
creator_id = None
for nid in c.get('nameIdentifiers', []):
@@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter):
if not orcid:
continue
creator_id = self.lookup_orcid(orcid)
- # If creator_id is None, should we create creators?
+ # TODO(martin): If creator_id is None, should we create creators?
contribs.append(fatcat_openapi_client.ReleaseContrib(
creator_id=creator_id,
index=i,
@@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter):
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
title, subtitle = None, None
- for entry in attributes.get('titles', []):
- if not title and 'titleType' not in entry:
- title = entry.get('title').strip()
- if entry.get('titleType') == 'Subtitle':
- subtitle = entry.get('title').strip()
+ titles = attributes.get('titles', []) or []
+ if len(titles) == 0:
+ print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+ return False
+ elif len(titles) == 1:
+ # We do not care about the type then.
+ title = titles[0].get('title', '') or ''
+ title = title.strip()
+ else:
+ for entry in titles:
+ if not title and ('titleType' not in entry or not entry.get('titleType')):
+ title = entry.get('title').strip()
+ if entry.get('titleType') == 'Subtitle':
+ subtitle = entry.get('title', '').strip()
+
+ if not title:
+ print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+ return False
+
+ if not subtitle:
+ subtitle = None
# Dates. A few internal dates (registered, created, updated) and
# published (0..2554). We try to work with typed date list, in
@@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter):
# "Updated", "Valid".
release_year, release_date = None, None
+ # Ignore: Collected, Issued.
date_type_prio = (
'Valid',
- 'Issued',
'Available',
'Accepted',
'Submitted',
'Copyrighted',
- 'Collected',
'Created',
'Updated',
)
@@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter):
for item in dates:
if not item.get('dateType') == prio:
continue
- try:
- result = dateparser.parse(item.get('date'))
- except TypeError as err:
- print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr)
- continue
+
+ # Parse out date, use common patterns first, fallback to dateparser.
+ result, value, year_only = None, item.get('date', ''), False
+
+ # Before using (expensive) dateparser, try a few common patterns.
+ common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')
+
+ for pattern in common_patterns:
+ try:
+ result = datetime.datetime.strptime(value, pattern)
+ except ValueError:
+ continue
+ else:
+ if pattern == '%Y':
+ year_only = True
+ break
+
+ if result is None:
+ print('fallback for {}'.format(value), file=sys.stderr)
+ try:
+ result = dateparser.parse(value)
+ except TypeError as err:
+ print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+ continue
+
if result is None:
# Unparsable date.
continue
- release_date = result
+ if not year_only:
+ release_date = result.date()
release_year = result.year
if 1000 < release_year < datetime.date.today().year + 5:
# Skip possibly bogus dates.
@@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter):
container_id = self.lookup_issnl(issnl)
if container_id is None and container.get('title'):
+ container_title = container.get('title')
+ if isinstance(container_title, list):
+ if len(container_title) > 0:
+ print('too many container titles: {}'.format(len(container_title)))
+ container_title = container_title[0]
+ assert isinstance(container_title, str)
ce = fatcat_openapi_client.ContainerEntity(
issnl=issnl,
container_type=container_type,
- name=container.get('title'),
+ name=container_title,
)
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
@@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter):
# closest, but not always supplied.
for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
value = attributes.get('types', {}).get(typeType)
- release_type = DATACITE_TYPE_MAP.get(value)
+ release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
if release_type is not None:
break
if release_type is None:
- print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
+ print("no mapped type: {}".format(value), file=sys.stderr)
# Language values are varied ("ger", "es", "English", "ENG", "en-us",
# "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter):
try:
language = langcodes.get(value).language
except langcodes.tag_parser.LanguageTagError:
- print('could not determine language: {}'.format(value), file=sys.stderr)
+ pass
# Abstracts appear in "attributes.descriptions[].descriptionType", some
# of the observed values: "Methods", "TechnicalInfo",
@@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter):
# "Other" fields might contain references or related articles (with
# DOI). TODO(martin): maybe try to parse out some of those refs.
abstracts = []
-
- for desc in attributes.get('descriptions', []):
+ descs = attributes.get('descriptions', []) or []
+ for desc in descs:
if not desc.get('descriptionType') == 'Abstract':
continue
if len(desc.get('description', '')) < 10:
@@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter):
text = desc.get('description')
sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
lang = None
- try:
- lang = langdetect.detect(text)
- except langdetect.lang_detect_exception.LangDetectException:
- pass
+ if self.lang_detect:
+ try:
+ lang = langdetect.detect(text)
+ except langdetect.lang_detect_exception.LangDetectException as err:
+ print('language detection failed: {}'.format(err), file=sys.stderr)
abstracts.append(fatcat_openapi_client.ReleaseAbstract(
mimetype="text/plain",
content=text,
@@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter):
# For the moment, we only care about References.
refs, ref_index = [], 0
- for rel in attributes.get('relatedIdentifiers', []):
+ relIds = attributes.get('relatedIdentifiers', []) or []
+ for rel in relIds:
if not rel.get('relationType') == 'References':
continue
ref_extra = dict()
@@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter):
if extra_datacite:
extra['datacite'] = extra_datacite
+ doi = attributes.get('doi', '').lower()
+ extids = self.lookup_ext_ids(doi=doi)
+
# Assemble release.
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter):
release_date=release_date,
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=attributes.get('doi'),
+ doi=doi,
+ pmid=extids['pmid'],
+ pmcid=extids['pmcid'],
+ wikidata_qid=extids['wikidata_qid'],
+ core=extids['core_id'],
+ arxiv=extids['arxiv_id'],
+ jstor=extids['jstor_id'],
),
contribs=contribs,
volume=volume,
@@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter):
)
return re
- def try_update(self, re, debug=True):
+ def try_update(self, re):
"""
- When debug is true, write the RE to stdout.
+ When debug is true, write the RE to stdout, not to the database. Might
+ hide schema mismatch bugs.
"""
- if debug is True:
+ if self.debug is True:
print(json.dumps(re.to_dict(), default=extended_json_encoder))
return False
@@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter):
return True
def insert_batch(self, batch):
+ print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
+ if self.insert_log_file:
+ with open(self.insert_log_file, 'a') as f:
+ for doc in batch:
+ json.dump(doc.to_dict(), f, default=extended_json_encoder)
+ f.write('\n')
self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
entity_list=batch))
def extended_json_encoder(value):
@@ -491,6 +588,7 @@ def extended_json_encoder(value):
return value.isoformat()
if isinstance(value, set):
return list(value)
+ raise TypeError('cannot encode type: {}'.format(type(value)))
def lookup_license_slug(raw):
"""
diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz
new file mode 100644
index 00000000..28ea6e37
--- /dev/null
+++ b/python/tests/files/datacite_1k_records.jsonl.gz
Binary files differ
diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl
new file mode 100644
index 00000000..dba3e267
--- /dev/null
+++ b/python/tests/files/datacite_sample.jsonl
@@ -0,0 +1 @@
+{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.<br>Taxonomy: Triticum turgidum L. subsp. durum (Desf.) Husn.<br>Common name(s): Wheat<br>Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon<br>Local sample unique identifier: 97090<br>Method of creation: Acquisition<br>Date: 1986<br>Biological status: Traditional cultivar/landrace<br>Other identifiers: ICDW 20791<br>MLS status: Included<br>Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 0bbaba2e..9c542fc6 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -1,25 +1,99 @@
"""
Test datacite importer.
+"""
-Datacite is a aggregator, hence inputs are quite varied.
+import datetime
+import pytest
+import gzip
+from fatcat_tools.importers import DataciteImporter, JsonLinePusher
+from fixtures import api
+import json
-Here is small sample of ID types taken from a sample:
- 497344 "DOI"
- 65013 "URL"
- 22210 "CCDC"
- 17853 "GBIF"
- 17635 "Other"
- 11474 "uri"
- 9170 "Publisher ID"
- 7775 "URN"
- 6196 "DUCHAS"
- 5624 "Handle"
- 5056 "publisherId"
+@pytest.fixture(scope="function")
+def datacite_importer(api):
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
+ bezerk_mode=True)
-A nice tool, not yet existing tool (maybe named indigo) would do the following:
+@pytest.fixture(scope="function")
+def datacite_importer_existing(api):
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
+ bezerk_mode=False)
- $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md
-TODO(martin): Write tests.
-"""
+@pytest.mark.skip(reason="larger datacite import slows tests down")
+def test_datacite_importer_huge(datacite_importer):
+ last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+ with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
+ datacite_importer.bezerk_mode = True
+ counts = JsonLinePusher(datacite_importer, f).run()
+ assert counts['insert'] == 998
+ change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+ release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+ assert len(release.contribs) == 3
+
+
+def test_datacite_importer(datacite_importer):
+ last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ datacite_importer.bezerk_mode = True
+ counts = JsonLinePusher(datacite_importer, f).run()
+ assert counts['insert'] == 1
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "datacite" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.DataciteImporter" in eg.extra['agent']
+
+ last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ datacite_importer.bezerk_mode = False
+ datacite_importer.reset()
+ counts = JsonLinePusher(datacite_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 1
+ assert counts['skip'] == 0
+ assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
+
+def test_datacite_dict_parse(datacite_importer):
+ with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ raw = json.load(f)
+ r = datacite_importer.parse_record(raw)
+ # ensure the API server is ok with format
+ JsonLinePusher(datacite_importer, [json.dumps(raw)]).run()
+
+ print(r.extra)
+ assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+ assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
+ assert r.release_type == "article"
+ assert r.release_stage == "published"
+ assert r.license_slug == None
+ assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+ assert r.ext_ids.doi == "10.18730/8dym9"
+ assert r.ext_ids.isbn13 == None
+ assert r.language == "enc"
+ assert r.subtitle == None
+ assert r.release_date == None
+ assert r.release_year == 1986
+ assert 'subtitle' not in r.extra
+ assert 'subtitle' not in r.extra['datacite']
+ assert 'funder' not in r.extra
+ assert 'funder' not in r.extra['datacite']
+ # matched by ISSN, so shouldn't be in there
+ #assert extra['container_name'] == "International Journal of Quantum Chemistry"
+ assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9'
+ assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
+ assert len(r.abstracts) == 1
+ assert len(r.abstracts[0].content) == 421
+ assert len(r.contribs) == 1
+ assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
+ assert r.contribs[0].given_name == None
+ assert r.contribs[0].surname == None
+ assert len(r.refs) == 0
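
For reference, the date handling added to parse_record() above (cheap strptime patterns first, the expensive dateparser fallback last) pulled out into a standalone sketch; the helper name is illustrative and does not exist in the importer.

    # Isolated sketch of the importer's date parsing strategy.
    import datetime
    import dateparser

    COMMON_PATTERNS = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')

    def parse_datacite_date(value):
        """Return (date or None, year or None) for a raw Datacite date string."""
        for pattern in COMMON_PATTERNS:
            try:
                result = datetime.datetime.strptime(value, pattern)
            except ValueError:
                continue
            # A bare '%Y' match only carries a year, not a full date.
            if pattern == '%Y':
                return None, result.year
            return result.date(), result.year
        result = dateparser.parse(value)
        if result is None:
            return None, None
        return result.date(), result.year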