diff options
author     Martin Czygan <martin.czygan@gmail.com>    2019-12-21 23:30:56 +0100
committer  Martin Czygan <martin.czygan@gmail.com>    2019-12-28 23:07:31 +0100
commit     a196435a0e88f85785742cdd089344f97401b43a (patch)
tree       056dceaa4ccd567096b2d3e789efdd573682a8c3
parent     52eabd48658a676ac4577d1c8da31df1fe58093e (diff)
download   fatcat-a196435a0e88f85785742cdd089344f97401b43a.tar.gz
           fatcat-a196435a0e88f85785742cdd089344f97401b43a.zip
address first round of MR14 comments
* add missing langdetect
* use entity_to_dict for json debug output
* factor out code for fields in function and add table driven tests
* update citeproc types
* add author as default role
* add raw_affiliation
* include relations from datacite
* remove url (covered by doi already)
Using yapf for python formatting.
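
As a quick illustration of the new langdetect dependency: the importer wraps detection in a guard, since langdetect raises on empty or non-linguistic input. A minimal sketch of that pattern (the helper name is illustrative, not part of the commit; the exception handling mirrors the diff below):

    import sys
    import langdetect

    def detect_abstract_lang(text):
        # langdetect.detect() raises LangDetectException on input it cannot
        # classify; in that case, record no language instead of failing.
        try:
            return langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException as err:
            print('language detection failed: {}'.format(err), file=sys.stderr)
            return None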
-rw-r--r--   python/Pipfile                               1
-rw-r--r--   python/Pipfile.lock                          7
-rw-r--r--   python/fatcat_tools/importers/datacite.py    467
-rw-r--r--   python/tests/import_datacite.py              178

4 files changed, 503 insertions, 150 deletions
diff --git a/python/Pipfile b/python/Pipfile
index dfb87514..6325c180 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -49,6 +49,7 @@ elasticsearch-dsl = ">=6.0.0,<7.0.0"
 elasticsearch = ">=6.0.0,<7.0.0"
 langcodes = ">=1.4"
 dateparser = ">=0.7"
+langdetect = "*"
 
 [requires]
 # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index b6e066b5..f0f60aa8 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -306,6 +306,13 @@
         "index": "pypi",
         "version": "==1.4.1"
     },
+    "langdetect": {
+        "hashes": [
+            "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"
+        ],
+        "index": "pypi",
+        "version": "==1.0.7"
+    },
     "loginpass": {
         "hashes": [
             "sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 77ce1012..19b89edf 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -14,6 +14,7 @@ import langcodes
 import langdetect
 import sqlite3
 import sys
+from fatcat_tools.transforms import entity_to_dict
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -55,16 +56,42 @@ DATACITE_TYPE_MAP = {
         'Thesis': 'thesis',
     },
     'citeproc': {
-        'dataset': 'dataset',
-        'chapter': 'chapter',
-        'article-journal': 'article-journal',
-        'song': 'song',
         'article': 'article',
-        'report': 'report',
+        'article-journal': 'article-journal',
+        'article-magazine': 'article-magazine',
+        'article-newspaper': 'article-newspaper',
+        'bill': 'bill',
+        'book': 'book',
+        'broadcast': 'broadcast',
+        'chapter': 'chapter',
+        'dataset': 'dataset',
+        'entry-dictionary': 'entry-dictionary',
+        'entry-encyclopedia': 'entry-encyclopedia',
+        'entry': 'entry',
+        'figure': 'figure',
         'graphic': 'graphic',
+        'interview': 'interview',
+        'legal_case': 'legal_case',
+        'legislation': 'legislation',
+        'manuscript': 'manuscript',
+        'map': 'map',
+        'motion_picture': 'motion_picture',
+        'musical_score': 'musical_score',
+        'pamphlet': 'pamphlet',
+        'paper-conference': 'paper-conference',
+        'patent': 'patent',
+        'personal_communication': 'personal_communication',
+        'post': 'post',
+        'post-weblog': 'post-weblog',
+        'report': 'report',
+        'review-book': 'review-book',
+        'review': 'review',
+        'song': 'song',
+        'speech': 'speech',
         'thesis': 'thesis',
-        'book': 'book',
-    },
+        'treaty': 'treaty',
+        'webpage': 'webpage',
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
     'bibtex': {
         'phdthesis': 'thesis',
         'inbook': 'chapter',
@@ -88,7 +115,6 @@ DATACITE_TYPE_MAP = {
     }
 }
 
-
 # TODO(martin): merge this with other maps, maybe.
 LICENSE_SLUG_MAP = {
@@ -124,7 +150,8 @@ LICENSE_SLUG_MAP = {
     "//www.karger.com/Services/SiteLicenses": "KARGER",
     "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
     "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
-    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1",  # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/EUPL-1.1":
+    "EUPL-1.1",  # redirects to EUPL-1.2
     "//www.opensource.org/licenses/MIT": "MIT",
     # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
     # "http://rsc.li/journals-terms-of-use": "RSC",
@@ -146,23 +173,31 @@ LICENSE_SLUG_MAP = {
     # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
 }
 
+
 class DataciteImporter(EntityImporter):
     """
     Importer for datacite records.
     """
-
-    def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
-                 insert_log_file=None, **kwargs):
-
-        eg_desc = kwargs.get('editgroup_description',
-            "Automated import of Datacite DOI metadata, harvested from REST API")
+    def __init__(self,
+                 api,
+                 issn_map_file,
+                 debug=False,
+                 lang_detect=False,
+                 insert_log_file=None,
+                 **kwargs):
+
+        eg_desc = kwargs.get(
+            'editgroup_description',
+            "Automated import of Datacite DOI metadata, harvested from REST API"
+        )
         eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter')
+        eg_extra['agent'] = eg_extra.get('agent',
+                                         'fatcat_tools.DataciteImporter')
         super().__init__(api,
-            issn_map_file=issn_map_file,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
+                         issn_map_file=issn_map_file,
+                         editgroup_description=eg_desc,
+                         editgroup_extra=eg_extra,
+                         **kwargs)
 
         self.create_containers = kwargs.get('create_containers', True)
         extid_map_file = kwargs.get('extid_map_file')
@@ -179,18 +214,31 @@ class DataciteImporter(EntityImporter):
         self.lang_detect = lang_detect
         self.insert_log_file = insert_log_file
 
-        print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+        print('datacite with debug={}, lang_detect={}'.format(
+            self.debug, self.lang_detect),
+              file=sys.stderr)
 
     def lookup_ext_ids(self, doi):
         """
         Return dictionary of identifiers refering to the same things as the
         given DOI.
         """
         if self.extid_map_db is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            return dict(core_id=None,
+                        pmid=None,
+                        pmcid=None,
+                        wikidata_qid=None,
+                        arxiv_id=None,
+                        jstor_id=None)
+        row = self.extid_map_db.execute(
+            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
             [doi.lower()]).fetchone()
         if row is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+            return dict(core_id=None,
+                        pmid=None,
+                        pmcid=None,
+                        wikidata_qid=None,
+                        arxiv_id=None,
+                        jstor_id=None)
         row = [str(cell or '') or None for cell in row]
         return dict(
             core_id=row[0],
@@ -206,6 +254,8 @@ class DataciteImporter(EntityImporter):
         """
         Mapping datacite JSON to ReleaseEntity.
         """
+        if not obj or not isinstance(obj, dict):
+            return None
         if 'attributes' not in obj:
             return None
@@ -218,43 +268,54 @@ class DataciteImporter(EntityImporter):
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
-            if 'nameType' in c and not c.get('nameType') == 'Personal':
-                continue
-            creator_id = None
-            for nid in c.get('nameIdentifiers', []):
-                if not nid.get('nameIdentifierScheme').lower() == "orcid":
+            nameType = c.get('nameType', '') or ''
+            if nameType == 'Personal':
+                creator_id = None
+                for nid in c.get('nameIdentifiers', []):
+                    if not nid.get('nameIdentifierScheme').lower() == "orcid":
+                        continue
+                    orcid = nid.get('nameIdentifier',
+                                    '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = None
+                if len(affiliations) == 0:
+                    raw_affiliation = None
+                else:
+                    raw_affiliation = affiliations[0]
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=c.get('name'),
+                        given_name=c.get('givenName'),
+                        surname=c.get('familyName'),
+                        role='author',
+                        raw_affiliation=raw_affiliation,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name == 'NN':
                     continue
-                orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '')
-                if not orcid:
+                if len(name) < 3:
                     continue
-                creator_id = self.lookup_orcid(orcid)
-                # TODO(martin): If creator_id is None, should we create creators?
-            contribs.append(fatcat_openapi_client.ReleaseContrib(
-                creator_id=creator_id,
-                index=i,
-                raw_name=c.get('name'),
-                given_name=c.get('givenName'),
-                surname=c.get('familyName'),
-            ))
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                print('unknown name type: {}'.format(nameType), file=sys.stderr)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
-        title, subtitle = None, None
         titles = attributes.get('titles', []) or []
-        if len(titles) == 0:
-            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
-            return False
-        elif len(titles) == 1:
-            # We do not care about the type then.
-            title = titles[0].get('title', '') or ''
-            title = title.strip()
-        else:
-            for entry in titles:
-                if not title and ('titleType' not in entry or not entry.get('titleType')):
-                    title = entry.get('title').strip()
-                if entry.get('titleType') == 'Subtitle':
-                    subtitle = entry.get('title', '').strip()
+        title, original_language_title, subtitle = parse_datacite_titles(
+            titles)
 
         if not title:
             print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
@@ -268,67 +329,14 @@ class DataciteImporter(EntityImporter):
         # "attributes.dates[].dateType", values: "Accepted", "Available"
         # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
         # "Updated", "Valid".
-        release_year, release_date = None, None
-
-        # Ignore: Collected, Issued.
-        date_type_prio = (
-            'Valid',
-            'Available',
-            'Accepted',
-            'Submitted',
-            'Copyrighted',
-            'Created',
-            'Updated',
-        )
-
-        # Before using (expensive) dateparser, try a few common patterns.
-        common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S', '%Y')
-
-        for prio in date_type_prio:
-            dates = attributes.get('dates', []) or []  # Never be None.
-            for item in dates:
-                if not item.get('dateType') == prio:
-                    continue
-
-                # Parse out date, use common patterns first, fallback to dateparser.
-                result, value, year_only = None, item.get('date', ''), False
-
-                for pattern in common_patterns:
-                    try:
-                        result = datetime.datetime.strptime(value, pattern)
-                    except ValueError:
-                        continue
-                    else:
-                        if pattern == '%Y':
-                            year_only = True
-                        break
-
-                if result is None:
-                    print('fallback for {}'.format(value), file=sys.stderr)
-                    try:
-                        result = dateparser.parse(value)
-                    except TypeError as err:
-                        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
-                        continue
-
-                if result is None:
-                    # Unparsable date.
-                    continue
-                if not year_only:
-                    release_date = result.date()
-                release_year = result.year
-                if 1000 < release_year < datetime.date.today().year + 5:
-                    # Skip possibly bogus dates.
-                    continue
-                break
-            else:
-                continue
-            break
+        release_date, release_year = parse_datacite_dates(
+            attributes.get('dates', []))
 
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
+                         '(:none)'):
             publisher = None
         if publisher is not None and len(publisher) > 80:
             # Arbitrary magic value max length. TODO(martin): better heuristic,
@@ -345,7 +353,8 @@ class DataciteImporter(EntityImporter):
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
             container_type = CONTAINER_TYPE_MAP.get(container['type'])
-            if container.get('identifier') and container.get('identifierType') == 'ISSN':
+            if container.get('identifier') and container.get(
+                    'identifierType') == 'ISSN':
                 issn = container.get('identifier')
                 if len(issn) == 8:
                     issn = issn[:4] + "-" + issn[4:]
@@ -357,7 +366,8 @@ class DataciteImporter(EntityImporter):
                 container_title = container.get('title')
                 if isinstance(container_title, list):
                     if len(container_title) > 0:
-                        print('too many container titles: {}'.format(len(container_title)))
+                        print('too many container titles: {}'.format(
+                            len(container_title)))
                         container_title = container_title[0]
                 assert isinstance(container_title, str)
                 ce = fatcat_openapi_client.ContainerEntity(
@@ -404,7 +414,8 @@ class DataciteImporter(EntityImporter):
         # types supplied in datacite. The "attributes.types.resourceType"
         # contains too many (176 in sample) things for now; citeproc may be the
         # closest, but not always supplied.
-        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
+        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
+                         'bibtex', 'ris'):
             value = attributes.get('types', {}).get(typeType)
             release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
@@ -442,19 +453,19 @@ class DataciteImporter(EntityImporter):
                 if len(desc.get('description', '')) < 10:
                     continue
                 text = desc.get('description')
-                sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
                 lang = None
                 if self.lang_detect:
                     try:
                         lang = langdetect.detect(text)
                     except langdetect.lang_detect_exception.LangDetectException as err:
-                        print('language detection failed: {}'.format(err), file=sys.stderr)
-                abstracts.append(fatcat_openapi_client.ReleaseAbstract(
-                    mimetype="text/plain",
-                    content=text,
-                    sha1=sha1,
-                    lang=lang,
-                ))
+                        print('language detection failed: {}'.format(err),
+                              file=sys.stderr)
+                abstracts.append(
+                    fatcat_openapi_client.ReleaseAbstract(
+                        mimetype="text/plain",
+                        content=text,
+                        lang=lang,
+                    ))
 
         # References and relations. Datacite include many relation types in
         # "attributes.relatedIdentifiers[].relationType", e.g.
@@ -476,17 +487,19 @@ class DataciteImporter(EntityImporter):
                     ref_extra['doi'] = rel.get('relatedIdentifier')
                 if not ref_extra:
                     ref_extra = None
-                refs.append(fatcat_openapi_client.ReleaseRef(
-                    index=ref_index,
-                    extra=ref_extra,
-                ))
+                refs.append(
+                    fatcat_openapi_client.ReleaseRef(
+                        index=ref_index,
+                        extra=ref_extra,
+                    ))
                 ref_index += 1
 
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
         release_stage = None
-        if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+        if attributes.get(
+                'state') == 'findable' or attributes.get('isActive') is True:
             release_stage = 'published'
 
         # Extra information.
@@ -496,8 +509,22 @@ class DataciteImporter(EntityImporter):
             extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
             extra_datacite['subjects'] = attributes['subjects']
-        if attributes.get('url'):
-            extra_datacite['url'] = attributes['url']
+
+        # Include certain relations from relatedIdentifiers. Keeping the
+        # original structure of data here, which is a list of dicts, with
+        # relation type, identifer and identifier type (mostly).
+        relations = []
+        for rel in relIds:
+            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
+                                           'IsVariantFormOf', 'IsSupplementTo',
+                                           'HasVersion', 'IsMetadataFor',
+                                           'IsNewVersionOf', 'IsIdenticalTo',
+                                           'IsVersionOf', 'IsDerivedFrom',
+                                           'IsSourceOf'):
+                relations.append(rel)
+
+        if relations:
+            extra_datacite['relations'] = relations
 
         extra = dict()
@@ -515,7 +542,7 @@ class DataciteImporter(EntityImporter):
             release_stage=release_stage,
             title=title,
             subtitle=subtitle,
-            original_title=title,
+            original_title=original_language_title,
             release_year=release_year,
             release_date=release_date,
             publisher=publisher,
@@ -546,7 +573,7 @@ class DataciteImporter(EntityImporter):
         hide schema mismatch bugs.
         """
         if self.debug is True:
-            print(json.dumps(re.to_dict(), default=extended_json_encoder))
+            print(json.dumps(entity_to_dict(re, api_client=None)))
             return False
 
         # lookup existing DOI (don't need to try other ext idents for crossref)
@@ -572,24 +599,15 @@ class DataciteImporter(EntityImporter):
         if self.insert_log_file:
             with open(self.insert_log_file, 'a') as f:
                 for doc in batch:
-                    json.dump(doc.to_dict(), f, default=extended_json_encoder)
+                    json.dump(entity_to_dict(doc, api_client=None), f)
                     f.write('\n')
-        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_release_auto_batch(
+            fatcat_openapi_client.ReleaseAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description,
+                    extra=self.editgroup_extra),
+                entity_list=batch))
 
-
-def extended_json_encoder(value):
-    """
-    Can be used with json.dumps(value, default=extended_json_encoder) to serialize
-    value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
-    """
-    if isinstance(value, (datetime.datetime, datetime.date)):
-        return value.isoformat()
-    if isinstance(value, set):
-        return list(value)
-    raise TypeError('cannot encode type: {}'.format(type(value)))
 
 def lookup_license_slug(raw):
     """
@@ -604,3 +622,156 @@ def lookup_license_slug(raw):
     if not raw.endswith('/'):
         raw = raw + '/'
     return LICENSE_SLUG_MAP.get(raw)
+
+
+def find_original_language_title(item, min_length=4, max_questionmarks=3):
+    """
+    Perform a few checks before returning a potential original language title.
+ """ + if not 'original_language_title' in item: + return None + title = item.get('title') + if not title: + return None + original_language_title = item.get('original_language_title') + if isinstance(original_language_title, + str) and title != original_language_title: + if len(original_language_title) < min_length: + return None + if original_language_title.count('?') > max_questionmarks: + return None + return original_language_title + if isinstance(original_language_title, dict): + content = original_language_title.get('__content__', '') or '' + if content and content != title and not content.count( + '?') > max_questionmarks: + return content + return None + + +def parse_datacite_titles(titles): + """ + Given a list of title items from datacite, return 3-tuple (title, + original_language_title, subtitle). + + Example input: + + [ + { + "title": "Meeting Heterogeneity in Consumer Demand" + } + ] + """ + title, original_language_title, subtitle = None, None, None + + if titles is None: + return title, original_language_title, subtitle + if len(titles) == 0: + return title, original_language_title, subtitle + elif len(titles) == 1: + original_language_title = find_original_language_title(titles[0]) + title = titles[0].get('title', '') or '' + title = title.strip() + if not title: + title = None + return title, original_language_title, subtitle + else: + for entry in titles: + if not title and ('titleType' not in entry + or not entry.get('titleType')): + title = entry.get('title').strip() + if not subtitle and entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + if not original_language_title: + original_language_title = find_original_language_title(entry) + + return title, original_language_title, subtitle + + +def parse_datacite_dates(dates): + """ + Given a list of date fields (under .dates), return tuple, (release_date, + release_year). + """ + release_date, release_year = None, None + + if not dates: + return release_date, release_year + + if not isinstance(dates, list): + raise ValueError('expected a list of date items') + + # Ignored: Collected, Issued. + date_type_prio = ( + 'Valid', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Created', + 'Updated', + ) + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S', '%Y') + + def parse_item(item): + result, value, year_only = None, item.get('date', ''), False + release_date, release_year = None, None + + for pattern in common_patterns: + try: + result = datetime.datetime.strptime(value, pattern) + except ValueError: + continue + else: + if pattern == '%Y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + try: + result = dateparser.parse(value) + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), + file=sys.stderr) + return result_date, result_year + + if result is None: + # Unparsable date. + return release_date, release_year + + if not year_only: + release_date = result.date() + release_year = result.year + + return release_date, release_year + + for prio in date_type_prio: + for item in dates: + if not item.get('dateType') == prio: + continue + + release_date, release_year = parse_item(item) + if release_date is None and release_year is None: + continue + + if release_year < 1000 or release_year > datetime.date.today( + ).year + 5: + # Skip possibly bogus dates. 
+                release_year = None
+                continue
+            break
+        else:
+            continue
+        break
+
+    if release_date is None and release_year is None:
+        for item in dates:
+            release_date, release_year = parse_item(item)
+            if release_year or release_date:
+                break
+
+    return release_date, release_year
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 9c542fc6..ab67a310 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -2,10 +2,12 @@
 Test datacite importer.
 """
 
+import collections
 import datetime
 import pytest
 import gzip
 from fatcat_tools.importers import DataciteImporter, JsonLinePusher
+from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates
 from fixtures import api
 import json
@@ -22,7 +24,6 @@ def datacite_importer_existing(api):
         yield DataciteImporter(api,
                                issn_file,
                                extid_map_file='tests/files/example_map.sqlite3',
                                bezerk_mode=False)
 
-
 @pytest.mark.skip(reason="larger datacite import slows tests down")
 def test_datacite_importer_huge(datacite_importer):
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
@@ -35,6 +36,179 @@ def test_datacite_importer_huge(datacite_importer):
         assert len(release.contribs) == 3
 
 
+def test_find_original_language_title():
+    """
+    Original language might be included, in various ways.
+    """
+    Case = collections.namedtuple('Case', 'about input result')
+    cases = [
+        Case('defaults to None', {}, None),
+        Case('ignore unknown keys', {'broken': 'kv'}, None),
+        Case('just a title', {'title': 'Noise Reduction'}, None),
+        Case('same title should be ignored', {
+            'title': 'Noise Reduction',
+            'original_language_title': 'Noise Reduction'
+        }, None),
+        Case('empty subdict is ignored', {
+            'title': 'Noise Reduction',
+            'original_language_title': {},
+        }, None),
+        Case('unknown subdict keys are ignored', {
+            'title': 'Noise Reduction',
+            'original_language_title': {'broken': 'kv'},
+        }, None),
+        Case('original string', {
+            'title': 'Noise Reduction',
+            'original_language_title': 'Подавление шума',
+        }, 'Подавление шума'),
+        Case('language tag is ignored, since its broken', {
+            'title': 'Noise Reduction',
+            'original_language_title': {
+                'language': 'ja',
+                '__content__': 'Noise Reduction'
+            },
+        }, None),
+        Case('do not care about language', {
+            'title': 'Noise Reduction',
+            'original_language_title': {
+                'language': 'ja',
+                '__content__': 'Rauschunterdrückung',
+            },
+        }, 'Rauschunterdrückung'),
+        Case('ignore excessive questionmarks', {
+            'title': 'Noise Reduction',
+            'original_language_title': {
+                'language': 'ja',
+                '__content__': '???? However',
+            },
+        }, None),
+    ]
+
+    for case in cases:
+        result = find_original_language_title(case.input)
+        assert result == case.result
+
+
+def test_parse_datacite_titles():
+    """
+    Given a list of titles, find title, original_language_title and subtitle.
+    Result is a 3-tuple of title, original_language_title, subtitle.
+ """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('handle None', None, (None, None, None)), + Case('empty list', [], (None, None, None)), + Case('empty item', [{}], (None, None, None)), + Case('broken keys', [{'broken': 'kv'}], (None, None, None)), + Case('title only', [{'title': 'Total carbon dioxide'}], + ('Total carbon dioxide', None, None), + ), + Case('title and subtitle', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, subtitle order does not matter', [ + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + {'title': 'Total carbon dioxide'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('multiple titles, first wins', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + ], + ('Total carbon dioxide', None, None), + ), + Case('multiple titles, plus sub', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('multiple titles, multiple subs', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, original, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + ), + Case('title, original same as title, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': { + '__content__': 'Total carbon dioxide', + }}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, original dict, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': { + '__content__': 'Всего углекислого газа', + }}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + ), + ] + + for case in cases: + result = parse_datacite_titles(case.input) + assert result == case.result, case.about + +def test_parse_datacite_dates(): + """ + Test datacite date parsing. 
+ """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('None is None', None, (None, None)), + Case('empty list is None', [], (None, None)), + Case('empty item is None', [{}], (None, None)), + Case('empty item is None', [{'date': '2019'}], (None, 2019)), + Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)), + Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)), + Case('first with type', [ + {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} + ], (None, 2019)), + Case('full date', [ + {'date': '2019-12-01', 'dateType': 'Valid'}, + ], (datetime.date(2019, 12, 1), 2019)), + Case('date type prio', [ + {'date': '2000-12-01', 'dateType': 'Valid'}, + {'date': '2010-01-01', 'dateType': 'Updated'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('date type prio, Available > Updated', [ + {'date': '2010-01-01', 'dateType': 'Updated'}, + {'date': '2000-12-01', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('allow different date formats, Available > Updated', [ + {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, + {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('allow different date formats, Available > Updated', [ + {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, + {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('allow fuzzy date formats, Available > Updated', [ + {'date': '2010', 'dateType': 'Updated'}, + {'date': '2000 Dec 01', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('ignore broken date', [ + {'date': 'Febrrr 45', 'dateType': 'Updated'}, + ], (None, None)), + ] + for case in cases: + result = parse_datacite_dates(case.input) + assert result == case.result, case.about + def test_datacite_importer(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index with open('tests/files/datacite_sample.jsonl', 'r') as f: @@ -75,7 +249,7 @@ def test_datacite_dict_parse(datacite_importer): assert r.release_type == "article" assert r.release_stage == "published" assert r.license_slug == None - assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.original_title == None assert r.ext_ids.doi == "10.18730/8dym9" assert r.ext_ids.isbn13 == None assert r.language == "enc" |