Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 824 |
1 file changed, 444 insertions(+), 380 deletions(-)
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a06c68a4..4c174b0b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
-    'Journal': 'journal',
-    'Series': 'journal',
-    'Book Series': 'book-series',
+    "Journal": "journal",
+    "Series": "journal",
+    "Book Series": "book-series",
 }
 
 # The docs/guide should be the canonical home for these mappings; update there
 # first. Map various datacite type types to CSL-ish types. None means TODO or
 # remove.
 DATACITE_TYPE_MAP = {
-    'ris': {
-        'THES': 'thesis',
-        'SOUND': 'song',  # 99.9% maps to citeproc song, so use that (exception: report)
-        'CHAP': 'chapter',
-        'FIGURE': 'figure',
-        'RPRT': 'report',
-        'JOUR': 'article-journal',
-        'MPCT': 'motion_picture',
-        'GEN': 'article-journal',  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
-        'BOOK': 'book',
-        'DATA': 'dataset',
-        'COMP': 'software',
+    "ris": {
+        "THES": "thesis",
+        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
+        "CHAP": "chapter",
+        "FIGURE": "figure",
+        "RPRT": "report",
+        "JOUR": "article-journal",
+        "MPCT": "motion_picture",
+        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
+        "BOOK": "book",
+        "DATA": "dataset",
+        "COMP": "software",
     },
-    'schemaOrg': {
-        'Dataset': 'dataset',
-        'Book': 'book',
-        'ScholarlyArticle': 'article-journal',
-        'ImageObject': 'graphic',
-        'Collection': None,
-        'MediaObject': None,
-        'Event': None,
-        'SoftwareSourceCode': 'software',
-        'Chapter': 'chapter',
-        'CreativeWork': None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
-        'PublicationIssue': 'article',
-        'AudioObject': None,
-        'Thesis': 'thesis',
+    "schemaOrg": {
+        "Dataset": "dataset",
+        "Book": "book",
+        "ScholarlyArticle": "article-journal",
+        "ImageObject": "graphic",
+        "Collection": None,
+        "MediaObject": None,
+        "Event": None,
+        "SoftwareSourceCode": "software",
+        "Chapter": "chapter",
+        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+        "PublicationIssue": "article",
+        "AudioObject": None,
+        "Thesis": "thesis",
     },
-    'citeproc': {
-        'article': 'article',
-        'article-journal': 'article-journal',
-        'article-magazine': 'article-magazine',
-        'article-newspaper': 'article-newspaper',
-        'bill': 'bill',
-        'book': 'book',
-        'broadcast': 'broadcast',
-        'chapter': 'chapter',
-        'dataset': 'dataset',
-        'entry-dictionary': 'entry-dictionary',
-        'entry-encyclopedia': 'entry-encyclopedia',
-        'entry': 'entry',
-        'figure': 'figure',
-        'graphic': 'graphic',
-        'interview': 'interview',
-        'legal_case': 'legal_case',
-        'legislation': 'legislation',
-        'manuscript': 'manuscript',
-        'map': 'map',
-        'motion_picture': 'motion_picture',
-        'musical_score': 'musical_score',
-        'pamphlet': 'pamphlet',
-        'paper-conference': 'paper-conference',
-        'patent': 'patent',
-        'personal_communication': 'personal_communication',
-        'post': 'post',
-        'post-weblog': 'post-weblog',
-        'report': 'report',
-        'review-book': 'review-book',
-        'review': 'review',
-        'song': 'song',
-        'speech': 'speech',
-        'thesis': 'thesis',
-        'treaty': 'treaty',
-        'webpage': 'webpage',
+    "citeproc": {
+        "article": "article",
+        "article-journal": "article-journal",
+        "article-magazine": "article-magazine",
+        "article-newspaper": "article-newspaper",
+        "bill": "bill",
+        "book": "book",
+        "broadcast": "broadcast",
+        "chapter": "chapter",
+        "dataset": "dataset",
+        "entry-dictionary": "entry-dictionary",
+        "entry-encyclopedia": "entry-encyclopedia",
+        "entry": "entry",
+        "figure": "figure",
+        "graphic": "graphic",
+        "interview": "interview",
+        "legal_case": "legal_case",
+        "legislation": "legislation",
+        "manuscript": "manuscript",
+        "map": "map",
+        "motion_picture": "motion_picture",
+        "musical_score": "musical_score",
+        "pamphlet": "pamphlet",
+        "paper-conference": "paper-conference",
+        "patent": "patent",
+        "personal_communication": "personal_communication",
+        "post": "post",
+        "post-weblog": "post-weblog",
+        "report": "report",
+        "review-book": "review-book",
+        "review": "review",
+        "song": "song",
+        "speech": "speech",
+        "thesis": "thesis",
+        "treaty": "treaty",
+        "webpage": "webpage",
     },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
-    'bibtex': {
-        'phdthesis': 'thesis',
-        'inbook': 'chapter',
-        'misc': None,
-        'article': 'article-journal',
-        'book': 'book',
+    "bibtex": {
+        "phdthesis": "thesis",
+        "inbook": "chapter",
+        "misc": None,
+        "article": "article-journal",
+        "book": "book",
     },
-    'resourceTypeGeneral': {
-        'Image': 'graphic',
-        'Dataset': 'dataset',
-        'PhysicalObject': None,
-        'Collection': None,
-        'Text': None,  # "Greyliterature, labnotes, accompanyingmaterials"
-        'Sound': None,
-        'InteractiveResource': None,
-        'Event': None,
-        'Software': 'software',
-        'Other': None,
-        'Workflow': None,
-        'Audiovisual': None,
-    }  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+    "resourceTypeGeneral": {
+        "Image": "graphic",
+        "Dataset": "dataset",
+        "PhysicalObject": None,
+        "Collection": None,
+        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
+        "Sound": None,
+        "InteractiveResource": None,
+        "Event": None,
+        "Software": "software",
+        "Other": None,
+        "Workflow": None,
+        "Audiovisual": None,
+    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
 }
 
 # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
 DATACITE_UNKNOWN_MARKERS = (
-    '(:unac)',  # temporarily inaccessible
-    '(:unal)',  # unallowed, suppressed intentionally
-    '(:unap)',  # not applicable, makes no sense
-    '(:unas)',  # value unassigned (e.g., Untitled)
-    '(:unav)',  # value unavailable, possibly unknown
-    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue)
-    '(:none)',  # never had a value, never will
-    '(:null)',  # explicitly and meaningfully empty
-    '(:tba)',  # to be assigned or announced later
-    '(:etal)',  # too numerous to list (et alia)
+    "(:unac)",  # temporarily inaccessible
+    "(:unal)",  # unallowed, suppressed intentionally
+    "(:unap)",  # not applicable, makes no sense
+    "(:unas)",  # value unassigned (e.g., Untitled)
+    "(:unav)",  # value unavailable, possibly unknown
+    "(:unkn)",  # known to be unknown (e.g., Anonymous, Inconnue)
+    "(:none)",  # never had a value, never will
+    "(:null)",  # explicitly and meaningfully empty
+    "(:tba)",  # to be assigned or announced later
+    "(:etal)",  # too numerous to list (et alia)
 )
 
 # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking
 # unknown values.
-UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
-    'NA',
-    'NN',
-    'n.a.',
-    '[s.n.]',
-    'Unknown',
-)))
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(
+    set(
+        (
+            "NA",
+            "NN",
+            "n.a.",
+            "[s.n.]",
+            "Unknown",
+        )
+    )
+)
 
 # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist.
 UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
@@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
 # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
 DATACITE_TITLE_SPAM_WORDGROUPS = [
     {
-        "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online',
-                   'free', 'hd', 'download', 'english', 'subtitle', 'bluray'),
+        "tokens": (
+            "full",
+            "movies",
+            "movie",
+            "watch",
+            "streaming",
+            "online",
+            "free",
+            "hd",
+            "download",
+            "english",
+            "subtitle",
+            "bluray",
+        ),
         "min": 4,
     }
 ]
@@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter):
     """
     Importer for datacite records.
     """
-    def __init__(self,
-                 api,
-                 issn_map_file,
-                 debug=False,
-                 insert_log_file=None,
-                 **kwargs):
+
+    def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs):
         eg_desc = kwargs.get(
-            'editgroup_description',
-            "Automated import of Datacite DOI metadata, harvested from REST API"
+            "editgroup_description",
+            "Automated import of Datacite DOI metadata, harvested from REST API",
         )
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent',
-                                         'fatcat_tools.DataciteImporter')
-        super().__init__(api,
-                         issn_map_file=issn_map_file,
-                         editgroup_description=eg_desc,
-                         editgroup_extra=eg_extra,
-                         **kwargs)
-
-        self.create_containers = kwargs.get('create_containers', True)
-        extid_map_file = kwargs.get('extid_map_file')
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter")
+        super().__init__(
+            api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs
+        )
+
+        self.create_containers = kwargs.get("create_containers", True)
+        extid_map_file = kwargs.get("extid_map_file")
         self.extid_map_db = None
         if extid_map_file:
             db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter):
         self.insert_log_file = insert_log_file
         self.this_year = datetime.datetime.now().year
 
-        print('datacite with debug={}'.format(self.debug), file=sys.stderr)
+        print("datacite with debug={}".format(self.debug), file=sys.stderr)
 
     def lookup_ext_ids(self, doi):
         """
         Return dictionary of identifiers referring to the same things as the given DOI.
         """
         if self.extid_map_db is None:
-            return dict(core_id=None,
-                        pmid=None,
-                        pmcid=None,
-                        wikidata_qid=None,
-                        arxiv_id=None,
-                        jstor_id=None)
+            return dict(
+                core_id=None,
+                pmid=None,
+                pmcid=None,
+                wikidata_qid=None,
+                arxiv_id=None,
+                jstor_id=None,
+            )
         row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
-            [doi.lower()]).fetchone()
+            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+        ).fetchone()
         if row is None:
-            return dict(core_id=None,
-                        pmid=None,
-                        pmcid=None,
-                        wikidata_qid=None,
-                        arxiv_id=None,
-                        jstor_id=None)
-        row = [str(cell or '') or None for cell in row]
+            return dict(
+                core_id=None,
+                pmid=None,
+                pmcid=None,
+                wikidata_qid=None,
+                arxiv_id=None,
+                jstor_id=None,
+            )
+        row = [str(cell or "") or None for cell in row]
         return dict(
             core_id=row[0],
             pmid=row[1],
@@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter):
         """
         if not obj or not isinstance(obj, dict):
             return None
-        if 'attributes' not in obj:
+        if "attributes" not in obj:
            return None
 
-        attributes = obj['attributes']
-        doi = clean_doi(attributes.get('doi', '').lower())
+        attributes = obj["attributes"]
+        doi = clean_doi(attributes.get("doi", "").lower())
 
         if not doi:
-            print('skipping record without a DOI', file=sys.stderr)
+            print("skipping record without a DOI", file=sys.stderr)
             return
 
         if not str.isascii(doi):
-            print('[{}] skipping non-ascii doi for now'.format(doi))
+            print("[{}] skipping non-ascii doi for now".format(doi))
             return None
 
-        creators = attributes.get('creators', []) or []
-        contributors = attributes.get('contributors', []) or []  # Much fewer than creators.
+        creators = attributes.get("creators", []) or []
+        contributors = attributes.get("contributors", []) or []  # Much fewer than creators.
 
         contribs = self.parse_datacite_creators(creators, doi=doi)
 
@@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter):
         # Related: https://guide.fatcat.wiki/entity_release.html -- role
         # (string, of a set): the type of contribution, from a controlled
         # vocabulary. TODO: vocabulary needs review.
-        contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
+        contribs_extra_contributors = self.parse_datacite_creators(
+            contributors, set_index=False, doi=doi
+        )
 
         # Unfortunately, creators and contributors might overlap, refs GH59.
         for cc in contribs_extra_contributors:
@@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter):
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
-        titles = attributes.get('titles', []) or []
-        title, original_language_title, subtitle = parse_datacite_titles(
-            titles)
+        titles = attributes.get("titles", []) or []
+        title, original_language_title, subtitle = parse_datacite_titles(titles)
 
         if title is None:
-            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+            print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
 
         title = clean(title)
         if not title:
-            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+            print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
 
         # check for blocklisted "spam", e.g. "FULL MOVIE"
@@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter):
         # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
         # "Updated", "Valid".
         release_date, release_month, release_year = parse_datacite_dates(
-            attributes.get('dates', []))
+            attributes.get("dates", [])
+        )
 
         # block bogus far-future years/dates
-        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+        if release_year is not None and (
+            release_year > (self.this_year + 5) or release_year < 1000
+        ):
             release_date = None
             release_month = None
             release_year = None
@@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter):
         # Some records do not use the "dates" field (e.g. micropub), but:
         # "attributes.published" or "attributes.publicationYear"
         if not any((release_date, release_month, release_year)):
-            release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+            release_date, release_month, release_year = parse_single_date(
+                attributes.get("publicationYear")
+            )
         if not any((release_date, release_month, release_year)):
-            release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+            release_date, release_month, release_year = parse_single_date(
+                attributes.get("published")
+            )
 
         if not any((release_date, release_month, release_year)):
-            print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+            print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr)
 
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
-        release_stage = 'published'
+        release_stage = "published"
 
         # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
         # we might want something else than 'published'. See also:
         # https://support.datacite.org/docs/doi-states.
 
         # Publisher. A few NA values. A few bogus values.
-        publisher = attributes.get('publisher')
+        publisher = attributes.get("publisher")
 
-        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
+        if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")):
             publisher = None
             release_stage = None
         if publisher is not None and len(publisher) > 80:
@@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter):
         container_id = None
         container_name = None
 
-        container = attributes.get('container', {}) or {}
-        if container.get('type') in CONTAINER_TYPE_MAP.keys():
-            container_type = CONTAINER_TYPE_MAP.get(container['type'])
-            if container.get('identifier') and container.get(
-                    'identifierType') == 'ISSN':
-                issn = container.get('identifier')
+        container = attributes.get("container", {}) or {}
+        if container.get("type") in CONTAINER_TYPE_MAP.keys():
+            container_type = CONTAINER_TYPE_MAP.get(container["type"])
+            if container.get("identifier") and container.get("identifierType") == "ISSN":
+                issn = container.get("identifier")
                 if len(issn) == 8:
                     issn = issn[:4] + "-" + issn[4:]
                 issnl = self.issn2issnl(issn)
                 if issnl is not None:
                     container_id = self.lookup_issnl(issnl)
 
-            if container_id is None and container.get('title'):
-                container_name = container.get('title')
+            if container_id is None and container.get("title"):
+                container_name = container.get("title")
                 if isinstance(container_name, list):
                     if len(container_name) > 0:
-                        print('[{}] too many container titles: {}'.format(doi,
-                                                                          len(container_name)))
+                        print(
+                            "[{}] too many container titles: {}".format(
+                                doi, len(container_name)
+                            )
+                        )
                         container_name = container_name[0]
                 assert isinstance(container_name, str)
                 ce = fatcat_openapi_client.ContainerEntity(
@@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter):
         else:
             # TODO(martin): factor this out into a testable function.
             # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
-            container_name = container.get('title')
+            container_name = container.get("title")
             if isinstance(container_name, list):
                 if len(container_name) > 0:
-                    print('[{}] too many container titles: {}'.format(doi,
-                                                                      len(container_name)))
+                    print(
+                        "[{}] too many container titles: {}".format(
+                            doi, len(container_name)
+                        )
+                    )
                     container_name = container_name[0]
 
         # Exception: https://www.micropublication.org/, see: !MR24.
         if container_id is None and container_name is None:
-            if publisher and publisher.lower().startswith('micropublication'):
+            if publisher and publisher.lower().startswith("micropublication"):
                 container_name = publisher
 
         # Volume and issue.
-        volume = container.get('volume')
-        issue = container.get('issue')
+        volume = container.get("volume")
+        issue = container.get("issue")
 
         if volume:
             volume = clean(volume)
@@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter):
         # Pages.
         pages = None
 
-        first_page = container.get('firstPage')
-        last_page = container.get('lastPage')
+        first_page = container.get("firstPage")
+        last_page = container.get("lastPage")
 
         if first_page and last_page:
             try:
                 _ = int(first_page) < int(last_page)
-                pages = '{}-{}'.format(first_page, last_page)
+                pages = "{}-{}".format(first_page, last_page)
             except ValueError as err:  # noqa: F841
                 # TODO(martin): This is more debug than info.
                 # print('[{}] {}'.format(doi, err), file=sys.stderr)
@@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter):
         license_slug = None
         license_extra = []
 
-        for lic in attributes.get('rightsList', []):
-            slug = lookup_license_slug(lic.get('rightsUri'))
+        for lic in attributes.get("rightsList", []):
+            slug = lookup_license_slug(lic.get("rightsUri"))
             if slug:
                 license_slug = slug
             license_extra.append(lic)
@@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter):
         # library solves it for you." -- TODO(martin): We need more of these.
         language = None
 
-        value = attributes.get('language', '') or ''
+        value = attributes.get("language", "") or ""
         try:
             language = pycountry.languages.lookup(value).alpha_2
         except (LookupError, AttributeError) as err:  # noqa: F841
@@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter):
         # "Other" fields might contain references or related articles (with
         # DOI). TODO(martin): maybe try to parse out some of those refs.
         abstracts = []
-        descs = attributes.get('descriptions', []) or []
+        descs = attributes.get("descriptions", []) or []
         for desc in descs:
-            if not desc.get('descriptionType') == 'Abstract':
+            if not desc.get("descriptionType") == "Abstract":
                 continue
 
             # Description maybe a string, int or list.
-            text = desc.get('description', '')
+            text = desc.get("description", "")
             if not text:
                 continue
             if isinstance(text, int):
-                text = '{}'.format(text)
+                text = "{}".format(text)
             if isinstance(text, list):
                 try:
                     text = "\n".join(text)
                 except TypeError:
-                    continue # Bail out, if it is not a list of strings.
+                    continue  # Bail out, if it is not a list of strings.
 
             # Limit length.
             if len(text) < 10:
@@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter):
             try:
                 lang = langdetect.detect(text)
             except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
-                print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
+                print(
+                    "[{}] language detection failed with {} on {}".format(doi, err, text),
+                    file=sys.stderr,
+                )
             abstract_text = clean(text)
             if not abstract_text:
                 continue
@@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter):
                     mimetype="text/plain",
                     content=abstract_text,
                     lang=lang,
-                ))
+                )
+            )
 
         # References and relations. Datacite include many relation types in
         # "attributes.relatedIdentifiers[].relationType", e.g.
@@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter):
         # For the moment, we only care about References.
         refs, ref_index = [], 0
 
-        relIds = attributes.get('relatedIdentifiers', []) or []
+        relIds = attributes.get("relatedIdentifiers", []) or []
         for rel in relIds:
-            if not rel.get('relationType', '') in ('References', 'Cites'):
+            if not rel.get("relationType", "") in ("References", "Cites"):
                 continue
             ref_extra = dict()
-            if rel.get('relatedIdentifierType', '') == 'DOI':
-                ref_extra['doi'] = rel.get('relatedIdentifier')
+            if rel.get("relatedIdentifierType", "") == "DOI":
+                ref_extra["doi"] = rel.get("relatedIdentifier")
            if not ref_extra:
                ref_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    index=ref_index,
                    extra=ref_extra,
-                ))
+                )
+            )
             ref_index += 1
 
         # More specific release_type via 'Reviews' relationsship.
         for rel in relIds:
-            if rel.get('relatedIdentifierType', '') != 'Reviews':
+            if rel.get("relatedIdentifierType", "") != "Reviews":
                 continue
-            release_type = 'review'
+            release_type = "review"
 
         # Extra information.
         extra_datacite = dict()
 
         if license_extra:
-            extra_datacite['license'] = license_extra
-        if attributes.get('subjects'):
-            extra_datacite['subjects'] = attributes['subjects']
+            extra_datacite["license"] = license_extra
+        if attributes.get("subjects"):
+            extra_datacite["subjects"] = attributes["subjects"]
 
         # Include version information.
-        metadata_version = attributes.get('metadataVersion') or ''
+        metadata_version = attributes.get("metadataVersion") or ""
 
         if metadata_version:
-            extra_datacite['metadataVersion'] = metadata_version
+            extra_datacite["metadataVersion"] = metadata_version
 
         # Include resource types.
-        types = attributes.get('types', {}) or {}
-        resource_type = types.get('resourceType', '') or ''
-        resource_type_general = types.get('resourceTypeGeneral', '') or ''
+        types = attributes.get("types", {}) or {}
+        resource_type = types.get("resourceType", "") or ""
+        resource_type_general = types.get("resourceTypeGeneral", "") or ""
 
         if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER:
-            extra_datacite['resourceType'] = resource_type
+            extra_datacite["resourceType"] = resource_type
         if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER:
-            extra_datacite['resourceTypeGeneral'] = resource_type_general
+            extra_datacite["resourceTypeGeneral"] = resource_type_general
 
         # Include certain relations from relatedIdentifiers. Keeping the
         # original structure of data here, which is a list of dicts, with
         # relation type, identifier and identifier type (mostly).
         relations = []
         for rel in relIds:
-            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
-                                           'IsVariantFormOf', 'IsSupplementTo',
-                                           'HasVersion', 'IsMetadataFor',
-                                           'IsNewVersionOf', 'IsIdenticalTo',
-                                           'IsVersionOf', 'IsDerivedFrom',
-                                           'IsSourceOf'):
+            if rel.get("relationType") in (
+                "IsPartOf",
+                "Reviews",
+                "Continues",
+                "IsVariantFormOf",
+                "IsSupplementTo",
+                "HasVersion",
+                "IsMetadataFor",
+                "IsNewVersionOf",
+                "IsIdenticalTo",
+                "IsVersionOf",
+                "IsDerivedFrom",
+                "IsSourceOf",
+            ):
                 relations.append(rel)
 
         if relations:
-            extra_datacite['relations'] = relations
+            extra_datacite["relations"] = relations
 
         extra = dict()
 
@@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter):
         # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
         # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
         # "10161", "10010691", "10780", # "Presentación"
-        version = attributes.get('version') or None
+        version = attributes.get("version") or None
 
         # top-level extra keys
         if not container_id and container_name:
-            extra['container_name'] = container_name
+            extra["container_name"] = container_name
 
         # Always include datacite key, even if value is empty (dict).
-        extra['datacite'] = extra_datacite
+        extra["datacite"] = extra_datacite
 
         # Preparation for a schema update.
         if release_month:
-            extra['release_month'] = release_month
+            extra["release_month"] = release_month
 
         extids = self.lookup_ext_ids(doi=doi)
 
@@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter):
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
                 doi=doi,
-                pmid=extids['pmid'],
-                pmcid=extids['pmcid'],
-                wikidata_qid=extids['wikidata_qid'],
-                core=extids['core_id'],
-                arxiv=extids['arxiv_id'],
-                jstor=extids['jstor_id'],
+                pmid=extids["pmid"],
+                pmcid=extids["pmcid"],
+                wikidata_qid=extids["wikidata_qid"],
+                core=extids["core_id"],
+                arxiv=extids["arxiv_id"],
+                jstor=extids["jstor_id"],
             ),
             contribs=contribs,
             volume=volume,
@@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter):
         """
         release_type = None
 
-        if not attributes.get('types'):
+        if not attributes.get("types"):
             return None
-        types = attributes['types']
+        types = attributes["types"]
 
-        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
+        for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):
             value = types.get(typeType)
             release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
                 break
 
         # special case: figshare "collections" which group other entities
-        if doi.startswith('10.6084/') or doi.startswith('10.25384'):
-            if types.get('resourceType') == "Collection":
+        if doi.startswith("10.6084/") or doi.startswith("10.25384"):
+            if types.get("resourceType") == "Collection":
                 release_type = "stub"
 
         if release_type is None:
@@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter):
         # publishes highly interesting datasets, but titles are mostly the same
         # ("GBIF Occurrence Download" or "Occurrence Download"); set
         # release_type to "stub" (CSL/FC).
-        if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'):
-            re.release_type = 'stub'
+        if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."):
+            re.release_type = "stub"
 
         # release_type exception: lots of "Experimental Crystal Structure Determination"
         # publisher: "Cambridge Crystallographic Data Centre"
-        if re.ext_ids.doi.startswith('10.5517/'):
-            re.release_type = 'entry'
+        if re.ext_ids.doi.startswith("10.5517/"):
+            re.release_type = "entry"
 
         # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
-        if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
-            re.release_type = 'component'
+        if re.title.lower().startswith("additional file") and re.release_type in (
+            "article",
+            "article-journal",
+        ):
+            re.release_type = "component"
 
         # figshare
-        if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'):
+        if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"):
             # set version if DOI ends with versioned suffix
-            doi_suffix = re.ext_ids.doi.split('.')[-1]
-            if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
+            doi_suffix = re.ext_ids.doi.split(".")[-1]
+            if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit():
                 re.version = doi_suffix
 
         # "Figure 123 from " -> component
         # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
-        if " from " in re.title and re.release_type not in ('stub', 'graphic'):
+        if " from " in re.title and re.release_type not in ("stub", "graphic"):
             if re.title.startswith("Figure "):
                 re.release_type = "component"
             elif re.title.startswith("Table "):
                 re.release_type = "component"
 
         # figshare.com
-        if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None:
-            re.extra['container_name'] = "figshare.com"
+        if (
+            re.ext_ids.doi.startswith("10.6084/m9.figshare.")
+            and re.extra.get("container_name") is None
+        ):
+            re.extra["container_name"] = "figshare.com"
 
         return re
 
@@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter):
         # eventually we'll want to support "updates", but for now just skip if
         # entity already exists
         if existing:
-            self.counts['exists'] += 1
+            self.counts["exists"] += 1
             return False
 
         return True
 
     def insert_batch(self, batch):
-        print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
+        print("inserting batch ({})".format(len(batch)), file=sys.stderr)
         if self.insert_log_file:
-            with open(self.insert_log_file, 'a') as f:
+            with open(self.insert_log_file, "a") as f:
                 for doc in batch:
                     json.dump(entity_to_dict(doc, api_client=None), f)
-                    f.write('\n')
+                    f.write("\n")
         self.api.create_release_auto_batch(
             fatcat_openapi_client.ReleaseAutoBatch(
                 editgroup=fatcat_openapi_client.Editgroup(
-                    description=self.editgroup_description,
-                    extra=self.editgroup_extra),
-                entity_list=batch))
+                    description=self.editgroup_description, extra=self.editgroup_extra
+                ),
+                entity_list=batch,
+            )
+        )
 
-    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+    def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None):
         """
         Parses a list of creators into a list of ReleaseContrib objects. Set
         set_index to False, if the index contrib field should be left blank.
@@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter):
         contribs = []
 
         # Names, that should be ignored right away.
-        name_blocklist = set(('Occdownload Gbif.Org',))
+        name_blocklist = set(("Occdownload Gbif.Org",))
 
         i = 0
         for c in creators:
             if not set_index:
                 i = None
-            nameType = c.get('nameType', '') or ''
-            if nameType in ('', 'Personal'):
+            nameType = c.get("nameType", "") or ""
+            if nameType in ("", "Personal"):
                 creator_id = None
-                for nid in c.get('nameIdentifiers', []) or []:
+                for nid in c.get("nameIdentifiers", []) or []:
                     if not isinstance(nid, dict):
                         # see: fatcat-workers/issues/44035/
-                        print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr)
+                        print(
+                            "unexpected nameIdentifiers, expected list of dicts, got: {}".format(
+                                nid
+                            ),
+                            file=sys.stderr,
+                        )
                         continue
-                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    name_scheme = nid.get("nameIdentifierScheme", "") or ""
                     if not name_scheme.lower() == "orcid":
                         continue
-                    orcid = nid.get('nameIdentifier') or ''
-                    orcid = orcid.replace('https://orcid.org/', '')
+                    orcid = nid.get("nameIdentifier") or ""
+                    orcid = orcid.replace("https://orcid.org/", "")
                     if not orcid:
                         continue
                     creator_id = self.lookup_orcid(orcid)
                     # TODO(martin): If creator_id is None, should we create creators?
 
                 # If there are multiple affiliation strings, use the first one.
-                affiliations = c.get('affiliation', []) or []
+                affiliations = c.get("affiliation", []) or []
                 raw_affiliation = None
                 if len(affiliations) == 0:
                     raw_affiliation = None
                 else:
                     raw_affiliation = clean(affiliations[0])
 
-                name = c.get('name')
-                given_name = c.get('givenName')
-                surname = c.get('familyName')
+                name = c.get("name")
+                given_name = c.get("givenName")
+                surname = c.get("familyName")
 
                 if name:
                     name = clean(name)
                 if not any((name, given_name, surname)):
                     continue
                 if not name:
-                    name = "{} {}".format(given_name or '', surname or '').strip()
+                    name = "{} {}".format(given_name or "", surname or "").strip()
                 if name in name_blocklist:
                     continue
                 if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter):
                 if not name:
                     continue
 
-                if raw_affiliation == '':
+                if raw_affiliation == "":
                     continue
 
                 extra = None
@@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter):
                 # "RelatedPerson", "ProjectLeader", "Editor", "Other",
                 # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
                 # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
-                contributorType = c.get('contributorType', '') or ''
+                contributorType = c.get("contributorType", "") or ""
 
                 if contributorType:
-                    extra = {'type': contributorType}
+                    extra = {"type": contributorType}
 
                 rc = fatcat_openapi_client.ReleaseContrib(
-                        creator_id=creator_id,
-                        index=i,
-                        raw_name=name,
-                        given_name=given_name,
-                        surname=surname,
-                        role=role,
-                        raw_affiliation=raw_affiliation,
-                        extra=extra,
-                    )
+                    creator_id=creator_id,
+                    index=i,
+                    raw_name=name,
+                    given_name=given_name,
+                    surname=surname,
+                    role=role,
+                    raw_affiliation=raw_affiliation,
+                    extra=extra,
+                )
 
                 # Filter out duplicates early.
                 if not contributor_list_contains_contributor(contribs, rc):
                     contribs.append(rc)
                     if i is not None:
                         i += 1
-            elif nameType == 'Organizational':
-                name = c.get('name', '') or ''
+            elif nameType == "Organizational":
+                name = c.get("name", "") or ""
                 if name in UNKNOWN_MARKERS:
                     continue
                 if len(name) < 3:
                     continue
-                extra = {'organization': name}
-                contribs.append(fatcat_openapi_client.ReleaseContrib(
-                    index=i, extra=extra))
+                extra = {"organization": name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
                 if i is not None:
                     i += 1
             else:
-                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+                print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr)
 
         return contribs
 
@@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor):
     for cc in contributor_list:
         if cc.raw_name != contributor.raw_name:
             continue
-        cc_role = cc.role or 'author'
-        contributor_role = contributor.role or 'author'
+        cc_role = cc.role or "author"
+        contributor_role = contributor.role or "author"
         if cc_role != contributor_role:
             continue
         return True
@@ -952,91 +1007,97 @@ def lookup_license_slug(raw):
     if not raw:
         return None
 
-    if 'creativecommons.org/publicdomain/zero' in raw:
-        return 'CC-0'
-    if raw.lower().endswith('/cc0'):
-        return 'CC-0'
+    if "creativecommons.org/publicdomain/zero" in raw:
+        return "CC-0"
+    if raw.lower().endswith("/cc0"):
+        return "CC-0"
 
-    if 'creativecommons' in raw:
+    if "creativecommons" in raw:
         # https://creativecommons.org/publicdomain/mark/1.0/deed.de
-        if 'creativecommons.org/publicdomain' in raw:
-            return 'CC-PUBLICDOMAIN'
-        if 'creativecommons.org/share-your-work/public-domain/cc0' in raw:
-            return 'CC-0'
+        if "creativecommons.org/publicdomain" in raw:
+            return "CC-PUBLICDOMAIN"
+        if "creativecommons.org/share-your-work/public-domain/cc0" in raw:
+            return "CC-0"
         # https://creativecommons.org/licenses/by/4.0/deed.es_ES
         raw = raw.lower()
-        match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE)
+        match = re.search(
+            r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE
+        )
         if not match:
-            print('missed potential license: {}'.format(raw), file=sys.stderr)
+            print("missed potential license: {}".format(raw), file=sys.stderr)
             return None
-        name = match.groupdict().get('name')
+        name = match.groupdict().get("name")
         if not name:
             return None
-        if not name.startswith('cc'):
-            name = 'cc-{}'.format(name)
+        if not name.startswith("cc"):
+            name = "cc-{}".format(name)
         return name.upper()
 
-    if 'opensource.org' in raw:
+    if "opensource.org" in raw:
         # https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2
-        match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE)
+        match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE)
        if not match:
-            print('missed potential license: {}'.format(raw), file=sys.stderr)
+            print("missed potential license: {}".format(raw), file=sys.stderr)
            return None
-        name = match.groupdict().get('name')
+        name = match.groupdict().get("name")
        if not name:
            return None
        if len(name) > 11:
            return None
        return name.upper()
 
-    if 'gnu.org' in raw:
+    if "gnu.org" in raw:
         # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
-        match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE)
+        match = re.search(
+            r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)",
+            raw,
+            re.IGNORECASE,
+        )
         if not match:
-            print('missed potential license: {}'.format(raw), file=sys.stderr)
+            print("missed potential license: {}".format(raw), file=sys.stderr)
             return None
-        name = match.groupdict().get('name')
+        name = match.groupdict().get("name")
         if not name:
             return None
         if len(name) > 8:
             return None
         return name.upper()
 
-    if 'spdx.org' in raw:
-        if 'spdx.org/licenses/CC0' in raw:
-            return 'CC-0'
+    if "spdx.org" in raw:
+        if "spdx.org/licenses/CC0" in raw:
+            return "CC-0"
         # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html
-        match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE)
+        match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE)
         if not match:
-            print('missed potential license: {}'.format(raw), file=sys.stderr)
+            print("missed potential license: {}".format(raw), file=sys.stderr)
             return None
-        name = match.groupdict().get('name')
+        name = match.groupdict().get("name")
         if not name:
             return None
         if len(name) > 36:
             return None
         # cleanup version and extensions
-        name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower())
+        name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower())
         return name.upper()
 
-    if 'rightsstatements.org' in raw:
+    if "rightsstatements.org" in raw:
         # http://rightsstatements.org/vocab/InC/1.0/
-        match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw)
+        match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw)
         if not match:
-            print('missed potential license: {}'.format(raw), file=sys.stderr)
+            print("missed potential license: {}".format(raw), file=sys.stderr)
             return None
-        name = match.groupdict().get('name')
+        name = match.groupdict().get("name")
         if not name:
             return None
         if len(name) > 9:
             return None
-        return 'RS-{}'.format(name.upper())
+        return "RS-{}".format(name.upper())
 
     # Fallback to mapped values.
     raw = raw.lower()
-    raw = raw.strip().replace('http://', '//').replace('https://', '//')
-    if not raw.endswith('/'):
-        raw = raw + '/'
+    raw = raw.strip().replace("http://", "//").replace("https://", "//")
+    if not raw.endswith("/"):
+        raw = raw + "/"
     return LICENSE_SLUG_MAP.get(raw)
 
@@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3):
 
     Example input: {'title': 'Some title', 'original_language_title': 'Some title'}
     """
-    if 'original_language_title' not in item:
+    if "original_language_title" not in item:
         return None
-    title = item.get('title')
+    title = item.get("title")
     if not title:
         return None
-    original_language_title = item.get('original_language_title')
-    if isinstance(original_language_title,
-                  str) and title != original_language_title:
+    original_language_title = item.get("original_language_title")
+    if isinstance(original_language_title, str) and title != original_language_title:
         if len(original_language_title) < min_length:
             return None
-        if original_language_title.count('?') > max_questionmarks:
+        if original_language_title.count("?") > max_questionmarks:
             return None
         return original_language_title
     if isinstance(original_language_title, dict):
-        content = original_language_title.get('__content__', '') or ''
-        if content and content != title and not content.count(
-                '?') > max_questionmarks:
+        content = original_language_title.get("__content__", "") or ""
+        if content and content != title and not content.count("?") > max_questionmarks:
             return content
     return None
 
@@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles):
         return title, original_language_title, subtitle
     elif len(titles) == 1:
         original_language_title = find_original_language_title(titles[0])
-        title = titles[0].get('title', '') or ''
+        title = titles[0].get("title", "") or ""
         title = title.strip()
         if not title:
             title = None
         return title, original_language_title, subtitle
     else:
         for entry in titles:
-            if not title and ('titleType' not in entry
-                              or not entry.get('titleType')):
-                title = (entry.get('title') or '').strip()
-            if not subtitle and entry.get('titleType') == 'Subtitle':
-                subtitle = entry.get('title', '').strip()
+            if not title and ("titleType" not in entry or not entry.get("titleType")):
+                title = (entry.get("title") or "").strip()
+            if not subtitle and entry.get("titleType") == "Subtitle":
+                subtitle = entry.get("title", "").strip()
             if not original_language_title:
                 original_language_title = find_original_language_title(entry)
 
     return title, original_language_title, subtitle
 
+
 def parse_single_date(value):
     """
     Given a single string containing a date in arbitrary format, try to return
@@ -1113,11 +1172,11 @@ def parse_single_date(value):
         # Results in a dict with keys: date_obj, period, locale.
         parse_result = parser.get_date_data(value)
         # A datetime object, later we need a date, only.
-        result = parse_result['date_obj']
+        result = parse_result["date_obj"]
         if result is not None:
-            if parse_result['period'] == 'year':
+            if parse_result["period"] == "year":
                 return None, None, result.year
-            elif parse_result['period'] == 'month':
+            elif parse_result["period"] == "month":
                 return None, result.month, result.year
             else:
                 return result.date(), result.month, result.year
@@ -1126,6 +1185,7 @@ def parse_single_date(value):
 
     return None, None, None
 
+
 def parse_datacite_dates(dates):
     """
     Given a list of date fields (under .dates), return tuple, (release_date,
@@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates):
         return release_date, release_month, release_year
 
     if not isinstance(dates, list):
-        raise ValueError('expected a list of date items')
+        raise ValueError("expected a list of date items")
 
     # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted",
     # "Collected", "Updated", "Copyrighted", "Created"
     # Ignored for now: "Collected", "Issued"
     date_type_prio = (
-        'Valid',
-        'Available',
-        'Accepted',
-        'Submitted',
-        'Copyrighted',
-        'Created',
-        'Updated',
+        "Valid",
+        "Available",
+        "Accepted",
+        "Submitted",
+        "Copyrighted",
+        "Created",
+        "Updated",
    )
 
    # We need to note the granularity, since a string like "2019" would be
    # parsed into "2019-01-01", even though the month is unknown. Use 3
    # granularity types: 'y', 'm', 'd'.
-    Pattern = collections.namedtuple('Pattern', 'layout granularity')
+    Pattern = collections.namedtuple("Pattern", "layout granularity")
 
    # Before using (expensive) dateparser, try a few common patterns.
    common_patterns = (
-        Pattern('%Y-%m-%d', 'd'),
-        Pattern('%Y-%m', 'm'),
-        Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
-        Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
-        Pattern('%Y', 'y'),
+        Pattern("%Y-%m-%d", "d"),
+        Pattern("%Y-%m", "m"),
+        Pattern("%Y-%m-%dT%H:%M:%SZ", "d"),
+        Pattern("%Y-%m-%dT%H:%M:%S", "d"),
+        Pattern("%Y", "y"),
    )
 
    def parse_item(item):
-        result, value, year_only = None, str(item.get('date', '')) or '', False
+        result, value, year_only = None, str(item.get("date", "")) or "", False
        release_date, release_month, release_year = None, None, None
 
        for layout, granularity in common_patterns:
@@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates):
             except ValueError:
                 continue
             else:
-                if granularity == 'y':
+                if granularity == "y":
                     year_only = True
                 break
 
         if result is None:
-            print('fallback for {}'.format(value), file=sys.stderr)
+            print("fallback for {}".format(value), file=sys.stderr)
             release_date, release_month, release_year = parse_single_date(value)
 
         if result is None:
             # Unparsable date.
             return release_date, release_month, release_year
 
-        if granularity != 'y':
+        if granularity != "y":
             release_date = result.date()
         release_year = result.year
-        if granularity in ('m', 'd'):
+        if granularity in ("m", "d"):
             release_month = result.month
 
         return release_date, release_month, release_year
@@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates):
 
     for prio in date_type_prio:
         for item in dates:
-            if not item.get('dateType') == prio:
+            if not item.get("dateType") == prio:
                 continue
 
             release_date, release_month, release_year = parse_item(item)
@@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates):
 
     return release_date, release_month, release_year
 
+
 def index_form_to_display_name(s):
     """
     Try to convert an index form name, like 'Razis, Panos A' into display_name,
     e.g. 'Panos A Razis'.
     """
-    if ',' not in s:
+    if "," not in s:
         return s
-    skip_on_chars = ['(', ')', '*']
+    skip_on_chars = ["(", ")", "*"]
     for char in skip_on_chars:
         if char in s:
             return s
-    if s.count(',') > 1:
+    if s.count(",") > 1:
         # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
         return s
 
     # Not names, but sprinkled in fields where authors live.
-    stopwords = [s.lower() for s in (
-        'Archive',
-        'Collection',
-        'Coordinator',
-        'Department',
-        'Germany',
-        'International',
-        'National',
-        'Netherlands',
-        'Office',
-        'Organisation',
-        'Organization',
-        'Service',
-        'Services',
-        'United States',
-        'University',
-        'Verein',
-        'Volkshochschule',
-    )]
+    stopwords = [
+        s.lower()
+        for s in (
+            "Archive",
+            "Collection",
+            "Coordinator",
+            "Department",
+            "Germany",
+            "International",
+            "National",
+            "Netherlands",
+            "Office",
+            "Organisation",
+            "Organization",
+            "Service",
+            "Services",
+            "United States",
+            "University",
+            "Verein",
+            "Volkshochschule",
+        )
+    ]
 
     lower = s.lower()
     for stop in stopwords:
        if stop in lower:
            return s
 
-    a, b = s.split(',')
-    return '{} {}'.format(b.strip(), a.strip())
+    a, b = s.split(",")
+    return "{} {}".format(b.strip(), a.strip())