author    | Martin Czygan <martin.czygan@gmail.com> | 2019-12-18 20:21:49 +0100
committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:31 +0100
commit    | 403b1a2d4591d878145a021a7c1e15e2d60c47d8 (patch)
tree      | 082ddd601a58b25be4ee176fdda97f935e23ea4b
parent    | 76d6d4d2de6580ae147e40c43c18f04cc48b62ec (diff)
download  | fatcat-403b1a2d4591d878145a021a7c1e15e2d60c47d8.tar.gz, fatcat-403b1a2d4591d878145a021a7c1e15e2d60c47d8.zip
improve datacite field mapping and import
The current version succeeded in importing a random sample of 100,000
records (0.5%) from datacite.
The --debug (write converted JSON to stdout) and --insert-log-file (log each
batch before committing it to the database) flags are temporarily added to
help with debugging.
Add a few unit tests.
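For reference, a minimal sketch of how the new constructor arguments can be
exercised outside of fatcat_import.py; the helper name, file paths and flag
values below are illustrative only, the real entry point is run_datacite():
```
# Sketch only: wires the new DataciteImporter keyword arguments
# (debug, lang_detect, insert_log_file) into a JsonLinePusher run.
from fatcat_tools.importers import DataciteImporter, JsonLinePusher

def import_datacite_dump(api, issn_map_path, dump_path):
    with open(issn_map_path, 'r') as issn_file:
        dci = DataciteImporter(api, issn_file,
                               debug=False,        # True: write converted JSON to stdout instead of the db
                               lang_detect=False,  # True: run (slow) language detection on abstracts
                               insert_log_file='/tmp/datacite-inserts.jsonl')
        with open(dump_path, 'r') as f:
            return JsonLinePusher(dci, f).run()
```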
Some edge cases:
a) Existing keys with a null value require a slightly awkward construct:
```
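# 'titles' may be present but explicitly null, so a .get() default alone is not enough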
titles = attributes.get('titles', []) or []
```
b) There can be zero, one, or more titles; the first suitable one wins, as sketched below.
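A condensed sketch of that selection logic from parse_record() in this commit
(the helper name pick_title is illustrative):
```
def pick_title(attributes):
    # Returns (title, subtitle); callers skip records without any title.
    title, subtitle = None, None
    titles = attributes.get('titles', []) or []
    if len(titles) == 0:
        return None, None
    if len(titles) == 1:
        # A single entry is used regardless of its titleType.
        title = (titles[0].get('title', '') or '').strip()
    else:
        for entry in titles:
            # The first entry without a titleType wins.
            if not title and not entry.get('titleType'):
                title = (entry.get('title', '') or '').strip()
            if entry.get('titleType') == 'Subtitle':
                subtitle = (entry.get('title', '') or '').strip()
    return title or None, subtitle or None
```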
c) Date handling is probably not ideal. Datacite has a potentially
fine-grained list of dates.
The test case (tests/files/datacite_sample.jsonl) refers to
https://ssl.fao.org/glis/doi/10.18730/8DYM9, which has 1986 as its date (main
descriptor). The datacite record contains: 2017 (publicationYear, probably the
year the record was created in the reference system), 1978-06-03 ("Collected",
e.g. the experimental sample), and 1986 ("Accepted"). The online version of
the resource lists yet another date (2019-06-05 10:14:43, by WIEWS update).
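The importer therefore prefers explicitly typed dates over publicationYear and
tries a few cheap strptime patterns before falling back to the (expensive)
dateparser library. A condensed sketch, with the helper name
parse_datacite_dates being illustrative:
```
import datetime
import dateparser

# Preference order for Datacite dateType values; "Issued" and "Collected"
# are deliberately ignored (see the example record above).
DATE_TYPE_PRIO = ('Valid', 'Available', 'Accepted', 'Submitted',
                  'Copyrighted', 'Created', 'Updated')

COMMON_PATTERNS = ('%Y-%m-%d', '%Y', '%Y-%m',
                   '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')

def parse_datacite_dates(dates):
    """
    Given attributes['dates'], return (release_date, release_year).
    """
    for prio in DATE_TYPE_PRIO:
        for item in dates or []:
            if item.get('dateType') != prio:
                continue
            value, result, year_only = item.get('date', ''), None, False
            for pattern in COMMON_PATTERNS:
                try:
                    result = datetime.datetime.strptime(value, pattern)
                except ValueError:
                    continue
                year_only = (pattern == '%Y')
                break
            if result is None:
                result = dateparser.parse(value)  # expensive fallback
            if result is None:
                continue
            # The importer additionally rejects implausible years at this point.
            return (None if year_only else result.date()), result.year
    return None, None
```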
-rwxr-xr-x | python/fatcat_import.py                         |  15
-rw-r--r-- | python/fatcat_tools/importers/datacite.py       | 180
-rw-r--r-- | python/tests/files/datacite_1k_records.jsonl.gz | bin 0 -> 684605 bytes
-rw-r--r-- | python/tests/files/datacite_sample.jsonl        |   1
-rw-r--r-- | python/tests/import_datacite.py                 | 108
5 files changed, 245 insertions, 59 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py index d7651792..90bb01a1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -170,7 +170,10 @@ def run_datacite(args): dci = DataciteImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + debug=args.debug, + lang_detect=args.lang_detect, + insert_log_file=args.insert_log_file) if args.kafka_mode: KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite", "fatcat-import", consume_batch_size=args.batch_size).run() @@ -464,6 +467,16 @@ def main(): sub_datacite.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") + sub_datacite.add_argument('--debug', + action='store_true', + help="write converted JSON to stdout") + sub_datacite.add_argument('--lang-detect', + action='store_true', + help="try to detect language (slow)") + sub_datacite.add_argument('--insert-log-file', + default='', + type=str, + help="write inserted documents into file (for debugging)") sub_datacite.set_defaults( func=run_datacite, auth_var="FATCAT_API_AUTH_TOKEN", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4e117dde..9774e334 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8 from .common import EntityImporter import dateparser -import langcodes import datetime -import langdetect import fatcat_openapi_client +import hashlib import json +import langcodes +import langdetect +import sqlite3 import sys -import hashlib # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { @@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = { class DataciteImporter(EntityImporter): """ - Importer for datacite records. TODO(martin): Do we need issn_map_file? + Importer for datacite records. """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api, issn_map_file, debug=False, lang_detect=False, + insert_log_file=None, **kwargs): eg_desc = kwargs.get('editgroup_description', "Automated import of Datacite DOI metadata, harvested from REST API") @@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter): **kwargs) self.create_containers = kwargs.get('create_containers', True) + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri), file=sys.stderr) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map", file=sys.stderr) + self.read_issn_map_file(issn_map_file) + self.debug = debug + self.lang_detect = lang_detect + self.insert_log_file = insert_log_file + + print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr) + + def lookup_ext_ids(self, doi): + """ + Return dictionary of identifiers refering to the same things as the given DOI. + """ + if self.extid_map_db is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def parse_record(self, obj): """ @@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter): attributes = obj['attributes'] - # Contributors. Many nameIdentifierSchemes, we do not use yet: - # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ - # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS", - # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ], + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. contribs = [] for i, c in enumerate(attributes['creators']): - if not c.get('nameType') == 'Personal': + if 'nameType' in c and not c.get('nameType') == 'Personal': continue creator_id = None for nid in c.get('nameIdentifiers', []): @@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter): if not orcid: continue creator_id = self.lookup_orcid(orcid) - # If creator_id is None, should we create creators? + # TODO(martin): If creator_id is None, should we create creators? contribs.append(fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, @@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter): # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" title, subtitle = None, None - for entry in attributes.get('titles', []): - if not title and 'titleType' not in entry: - title = entry.get('title').strip() - if entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title').strip() + titles = attributes.get('titles', []) or [] + if len(titles) == 0: + print('skipping record w/o title: {}'.format(obj), file=sys.stderr) + return False + elif len(titles) == 1: + # We do not care about the type then. + title = titles[0].get('title', '') or '' + title = title.strip() + else: + for entry in titles: + if not title and ('titleType' not in entry or not entry.get('titleType')): + title = entry.get('title').strip() + if entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + + if not title: + print('skipping record w/o title: {}'.format(obj), file=sys.stderr) + return False + + if not subtitle: + subtitle = None # Dates. A few internal dates (registered, created, updated) and # published (0..2554). We try to work with typed date list, in @@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter): # "Updated", "Valid". release_year, release_date = None, None + # Ignore: Collected, Issued. date_type_prio = ( 'Valid', - 'Issued', 'Available', 'Accepted', 'Submitted', 'Copyrighted', - 'Collected', 'Created', 'Updated', ) @@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter): for item in dates: if not item.get('dateType') == prio: continue - try: - result = dateparser.parse(item.get('date')) - except TypeError as err: - print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr) - continue + + # Parse out date, use common patterns first, fallback to dateparser. + result, value, year_only = None, item.get('date', ''), False + + # Before using (expensive) dateparser, try a few common patterns. 
+ common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S') + + for pattern in common_patterns: + try: + result = datetime.datetime.strptime(value, pattern) + except ValueError: + continue + else: + if pattern == '%Y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + try: + result = dateparser.parse(value) + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + continue + if result is None: # Unparsable date. continue - release_date = result + if not year_only: + release_date = result.date() release_year = result.year if 1000 < release_year < datetime.date.today().year + 5: # Skip possibly bogus dates. @@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter): container_id = self.lookup_issnl(issnl) if container_id is None and container.get('title'): + container_title = container.get('title') + if isinstance(container_title, list): + if len(container_title) > 0: + print('too many container titles: {}'.format(len(container_title))) + container_title = container_title[0] + assert isinstance(container_title, str) ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, container_type=container_type, - name=container.get('title'), + name=container_title, ) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter): # closest, but not always supplied. for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'): value = attributes.get('types', {}).get(typeType) - release_type = DATACITE_TYPE_MAP.get(value) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break if release_type is None: - print("datacite unmapped type: {}".format(release_type), file=sys.stderr) + print("no mapped type: {}".format(value), file=sys.stderr) # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). Try to crush it with langcodes: "It may sound to you @@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter): try: language = langcodes.get(value).language except langcodes.tag_parser.LanguageTagError: - print('could not determine language: {}'.format(value), file=sys.stderr) + pass # Abstracts appear in "attributes.descriptions[].descriptionType", some # of the observed values: "Methods", "TechnicalInfo", @@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter): # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] - - for desc in attributes.get('descriptions', []): + descs = attributes.get('descriptions', []) or [] + for desc in descs: if not desc.get('descriptionType') == 'Abstract': continue if len(desc.get('description', '')) < 10: @@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter): text = desc.get('description') sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest() lang = None - try: - lang = langdetect.detect(text) - except langdetect.lang_detect_exception.LangDetectException: - pass + if self.lang_detect: + try: + lang = langdetect.detect(text) + except langdetect.lang_detect_exception.LangDetectException as err: + print('language detection failed: {}'.format(err), file=sys.stderr) abstracts.append(fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", content=text, @@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter): # For the moment, we only care about References. 
refs, ref_index = [], 0 - for rel in attributes.get('relatedIdentifiers', []): + relIds = attributes.get('relatedIdentifiers', []) or [] + for rel in relIds: if not rel.get('relationType') == 'References': continue ref_extra = dict() @@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter): if extra_datacite: extra['datacite'] = extra_datacite + doi = attributes.get('doi', '').lower() + extids = self.lookup_ext_ids(doi=doi) + # Assemble release. re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter): release_date=release_date, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=attributes.get('doi'), + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], ), contribs=contribs, volume=volume, @@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter): ) return re - def try_update(self, re, debug=True): + def try_update(self, re): """ - When debug is true, write the RE to stdout. + When debug is true, write the RE to stdout, not to the database. Might + hide schema mismatch bugs. """ - if debug is True: + if self.debug is True: print(json.dumps(re.to_dict(), default=extended_json_encoder)) return False @@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter): return True def insert_batch(self, batch): + print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + if self.insert_log_file: + with open(self.insert_log_file, 'a') as f: + for doc in batch: + json.dump(doc.to_dict(), f, default=extended_json_encoder) + f.write('\n') self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), + description=self.editgroup_description, + extra=self.editgroup_extra), entity_list=batch)) def extended_json_encoder(value): @@ -491,6 +588,7 @@ def extended_json_encoder(value): return value.isoformat() if isinstance(value, set): return list(value) + raise TypeError('cannot encode type: {}'.format(type(value))) def lookup_license_slug(raw): """ diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz Binary files differnew file mode 100644 index 00000000..28ea6e37 --- /dev/null +++ b/python/tests/files/datacite_1k_records.jsonl.gz diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl new file mode 100644 index 00000000..dba3e267 --- /dev/null +++ b/python/tests/files/datacite_sample.jsonl @@ -0,0 +1 @@ +{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.<br>Taxonomy: Triticum turgidum L. subsp. durum (Desf.) Husn.<br>Common name(s): Wheat<br>Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon<br>Local sample unique identifier: 97090<br>Method of creation: Acquisition<br>Date: 1986<br>Biological status: Traditional cultivar/landrace<br>Other identifiers: ICDW 20791<br>MLS status: Included<br>Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 0bbaba2e..9c542fc6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -1,25 +1,99 @@ """ Test datacite importer. +""" -Datacite is a aggregator, hence inputs are quite varied. 
+import datetime +import pytest +import gzip +from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fixtures import api +import json -Here is small sample of ID types taken from a sample: - 497344 "DOI" - 65013 "URL" - 22210 "CCDC" - 17853 "GBIF" - 17635 "Other" - 11474 "uri" - 9170 "Publisher ID" - 7775 "URN" - 6196 "DUCHAS" - 5624 "Handle" - 5056 "publisherId" +@pytest.fixture(scope="function") +def datacite_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=True) -A nice tool, not yet existing tool (maybe named indigo) would do the following: +@pytest.fixture(scope="function") +def datacite_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=False) - $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md -TODO(martin): Write tests. -""" +@pytest.mark.skip(reason="larger datacite import slows tests down") +def test_datacite_importer_huge(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 3 + + +def test_datacite_importer(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "datacite" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = False + datacite_importer.reset() + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + +def test_datacite_dict_parse(datacite_importer): + with open('tests/files/datacite_sample.jsonl', 'r') as f: + raw = json.load(f) + r = datacite_importer.parse_record(raw) + # ensure the API server is ok with format + JsonLinePusher(datacite_importer, [json.dumps(raw)]).run() + + print(r.extra) + assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert r.release_type == "article" + assert r.release_stage == "published" + assert r.license_slug == None + assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090" + assert r.ext_ids.doi == "10.18730/8dym9" + assert r.ext_ids.isbn13 == None + assert r.language == "enc" + assert r.subtitle == None + assert r.release_date == None + assert r.release_year == 1986 + assert 'subtitle' not in r.extra + assert 'subtitle' not in r.extra['datacite'] + assert 'funder' not in r.extra + assert 'funder' not in r.extra['datacite'] + # matched by ISSN, so shouldn't be in there + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9' + assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + assert len(r.abstracts) == 1 + assert len(r.abstracts[0].content) == 421 + assert len(r.contribs) == 1 + assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" + assert r.contribs[0].given_name == None + assert r.contribs[0].surname == None + assert len(r.refs) == 0 |