| author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-09 01:03:43 +0100 |
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:31 +0100 |
| commit | 4a82a0763bf927248f22e47ab5187af4beff83ee (patch) | |
| tree | af86801bfb77a40bc8b409fa736b40c581fe970c | |
| parent | 54a2c83c0a5e8ccd4eec7c18eac715bdbb3eb62e (diff) | |
| download | fatcat-4a82a0763bf927248f22e47ab5187af4beff83ee.tar.gz, fatcat-4a82a0763bf927248f22e47ab5187af4beff83ee.zip | |
datacite: importer skeleton
* contributors, title, date, publisher, container, license
Field and value analysis via https://github.com/miku/indigo.
| mode | path | insertions |
|---|---|---|
| -rwxr-xr-x | python/fatcat_import.py | 30 |
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 458 |
| -rw-r--r-- | python/tests/import_datacite.py | 25 |

4 files changed, 514 insertions, 0 deletions
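The importer follows the repo's usual two-part pattern: an `EntityImporter` subclass (`DataciteImporter`) fed records by a pusher (`JsonLinePusher` or `KafkaJsonPusher`). A minimal sketch of driving it programmatically, mirroring the new `run_datacite()` in the diff below — the endpoint and file names are placeholders, and the `public_api` helper from `fatcat_tools` is an assumption:

```python
# Hypothetical usage sketch, not part of this commit.
from fatcat_tools import public_api  # assumed helper returning an API client
from fatcat_tools.importers import DataciteImporter, JsonLinePusher

api = public_api('http://localhost:9411/v0')  # placeholder endpoint
dci = DataciteImporter(api, open('ISSN-to-ISSN-L.txt', 'r'), edit_batch_size=50)

# One datacite v2 API document per line.
JsonLinePusher(dci, open('datacite.ndjson', 'r')).run()
```

Given the argparse wiring added below, the equivalent CLI invocation would be `./fatcat_import.py datacite datacite.ndjson ISSN-to-ISSN-L.txt` (file names illustrative).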
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 8d82dab3..d7651792 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,17 @@ def run_cdl_dash_dat(args):
     print("fileset id: {}".format(fs.ident))
     print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
 
+def run_datacite(args):
+    dci = DataciteImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        bezerk_mode=args.bezerk_mode)
+    if args.kafka_mode:
+        KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite",
+            "fatcat-import", consume_batch_size=args.batch_size).run()
+    else:
+        JsonLinePusher(dci, args.json_file).run()
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -439,6 +450,25 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")
 
+    sub_datacite = subparsers.add_parser('datacite',
+        help="import datacite.org metadata")
+    sub_datacite.add_argument('json_file',
+        help="File with jsonlines from datacite.org v2 API to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_datacite.add_argument('issn_map_file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_datacite.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_datacite.add_argument('--bezerk-mode',
+        action='store_true',
+        help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+    sub_datacite.set_defaults(
+        func=run_datacite,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index bb9c5b17..d936605f 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -14,6 +14,7 @@ To run an import you combine two classes; one each of:
 
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
+from .datacite import DataciteImporter
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
new file mode 100644
index 00000000..faa8e2be
--- /dev/null
+++ b/python/fatcat_tools/importers/datacite.py
@@ -0,0 +1,458 @@
+"""
+WIP: Importer for datacite.org data.
+
+Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
+"""
+
+import datetime
+import json
+import sys
+
+import dateparser
+import fatcat_openapi_client
+
+from .common import EntityImporter
+
+# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
+CONTAINER_TYPE_MAP = {
+    'Journal': 'journal',
+    'Series': 'journal',
+    'Book Series': 'book-series',
+}
+
+# TODO(martin): merge this with other maps, maybe.
+LICENSE_SLUG_MAP = {
+    "//creativecommons.org/licenses/by/2.0": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
+    "//creativecommons.org/licenses/CC-BY/4.0": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//opensource.org/licenses/MIT": "MIT",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
+    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
+    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1",  # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/MIT": "MIT",
+    # "http://royalsocietypublishing.org/licence": "",  # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
+    # "http://rsc.li/journals-terms-of-use": "RSC",
+    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "",  # 53 UrhG.
+ # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "", + # "http://www.springer.com/tdm": "", + # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html + # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "", + # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "", + # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "", + # "https://rightsstatements.org/page/InC/1.0?language=en": "", + # "https://services.ceda.ac.uk/cedasite/register/info": "", + # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404 + # "https://www.cambridge.org/core/terms": "", + # "https://www.elsevier.com/tdm/userlicense/1.0", + # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights + # "info:eu-repo/semantics/embargoedAccess": "", + # "info:eu-repo/semantics/openAccess": "", +} + +class DataciteImporter(EntityImporter): + """ + Importer for datacite records. TODO(martin): Do we need issn_map_file? + """ + + def __init__(self, api, issn_map_file, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of Datacite DOI metadata, harvested from REST API") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.create_containers = kwargs.get('create_containers', True) + self.read_issn_map_file(issn_map_file) + + def parse_record(self, obj): + """ + TODO(martin): Map datacite to RE. + + WIP, notes: + + * Many subjects, should they end up in extra? + * attributes.creators and attributes.contributors + + $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr + 3963663 "Personal" + 289795 null + 8892 "Organizational" + + Shall we use issued, available? + + { + "date": "2011-11-18", + "dateType": "Accepted" + }, + { + "date": "2011-11-18", + "dateType": "Available" + }, + { + "date": "2011-11-07", + "dateType": "Copyrighted" + }, + { + "date": "2011-11-18", + "dateType": "Issued" + }, + { + "date": "2011-11-07", + "dateType": "Issued" + } + + TODO(martin): Quick analysis of dates and stages. 
+ """ + + if 'attributes' not in obj: + return None + + attributes = obj['attributes'] + + # > Contributors + # + # "attributes.creators[].contributorType": [ + # "author" + # ], + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ + # "LCNA", + # "GND", + # "email", + # "NAF", + # "OSF", + # "RRID", + # "ORCID", + # "SCOPUS", + # "NRCPID", + # "schema.org", + # "GRID", + # "MGDS", + # "VIAF", + # "JACoW-ID" + # ], + # + # "https://orcid.org/0000-0002-9902-738X", + # "http://jacow.org/JACoW-00001280", + # "Wiebe_Peter", + # "https://osf.io/https://osf.io/kjfuy/", + # "http://www.viaf.org176549220", + # "2239", + # "Jeffries_Martin", + # "https://orcid.org/0000-0002-1493-6630", + # "0000-0002-6233-612X", + # + # "creators": [ + # { + # "name": "Bögli, Hans", + # "nameType": "Personal", + # "givenName": "Hans", + # "familyName": "Bögli", + # "affiliation": [] + # } + # ], + + contribs = [] + + for i, c in enumerate(attributes['creators']): + if not c.get('nameType') == 'Personal': + continue + creator_id = None + for nid in c.get('nameIdentifiers', []): + if not nid.get('nameIdentifierScheme').lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # If creator_id is None, should we create creators? + contribs.append(fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=c.get('name'), + given_name=c.get('givenName'), + surname=c.get('familyName'), + )) + + # > Title + # + # "attributes.titles[].titleType": [ + # "AlternativeTitle", + # "Other", + # "Subtitle", + # null, + # "TranslatedTitle" + # ], + title, subtitle = None, None + + for entry in attributes.get('titles', []): + if not title and 'titleType' not in entry: + title = entry.get('title').strip() + if entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title').strip() + + # > Dates + # + # "attributes.dates[].dateType": [ + # "Accepted", + # "Available" + # "Collected", + # "Copyrighted", + # "Created", + # "Issued", + # "Submitted", + # "Updated", + # "Valid", + # ], + # + # Different documents have different dates defined. Choose the topmost + # available from prio list. + date_type_prio = ( + 'Valid', + 'Issued', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Collected', + 'Created', + 'Updated', + ) + + release_year, release_date = None, None + for prio in date_type_prio: + dates = attributes.get('dates', []) or [] # Never be None. + for item in dates: + if not item.get('dateType') == prio: + continue + result = dateparser.parse(item.get('date')) + if result is None: + # Unparsable date. + continue + release_date = result + release_year = result.year + if 1000 < release_year < datetime.date.today().year + 5: + # Skip possibly bogus dates. + continue + break + else: + continue + break + + # > Publisher + # + # A few NA values. A few bogus values. + # + publisher = attributes.get('publisher') + + if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'): + publisher = None + if publisher is not None and len(publisher) > 80: + # Arbitrary magic value, TODO(martin): better heuristic. + # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, + # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt, + # Nutzungsrechte müssen durch den Nutzer abgeklärt werden", + # TODO(martin): log misses. + publisher = None + + # > Container + # + # For the moment, only ISSN as container. 
+ # + # "container": { + # "type": "Journal", + # "issue": "8", + # "title": "Angewandte Chemie International Edition", + # "volume": "57", + # "lastPage": "2080", + # "firstPage": "2077", + # "identifier": "14337851", + # "identifierType": "ISSN" + # }, + # + # "attributes.container.type": [ + # "DataRepository", + # "Journal", + # "Series", + # "Book Series" + # ], + # + # "attributes.container.identifierType": [ + # "Handle", + # "ISBN", + # "LISSN", + # "DOI", + # "EISSN", + # "URL", + # "ISSN" + # ], + + container_id = None + container = attributes.get('container', {}) or {} + if container.get('type') in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container['type']) + if container.get('identifier') and container.get('identifierType') == 'ISSN': + issn = container.get('identifier') + if len(issn) == 8: + issn = issn[:4] + "-" + issn[4:] + issnl = self.issn2issnl(issn) + container_id = self.lookup_issnl(issnl) + + if container_id is None and container.get('title'): + ce = fatcat_openapi_client.ContainerEntity( + issnl=issnl, + container_type=container_type, + name=container.get('title'), + ) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + self._issnl_id_map[issnl] = container_id + + # > License + # + # attributes.rightsList[].rightsUri + # attributes.rightsList[].rights + # attributes.rightsList[].lang + # + + license_slug = None + license_extra = [] + for l in attributes.get('rightsList', []): + slug = lookup_license_slug(l.get('rightsUri')) + if slug: + license_slug = slug + license_extra.append(l) + + # > Release type. + # + # Datacite has some fine granular typing (e.g. "Supplementary + # Collection of Datasets", "Taxonomic treatment", "blog_entry", ... + # + # Additional, coarse: resourceTypeGeneral + # + # "attributes.types.resourceTypeGeneral": [ + # "Image", + # "Dataset", + # "PhysicalObject", + # "Collection", + # "Text", + # "Sound", + # "InteractiveResource", + # "Event", + # "Software", + # "Other", + # "Workflow", + # "Audiovisual" + # ], + + # > Extra information. + extra, extra_datacite = dict(), dict() + if license_extra: + extra_datacite['license'] = license_extra + + if extra_datacite: + extra['datacite'] = extra_datacite + + # https://guide.fatcat.wiki/entity_release.html + re = fatcat_openapi_client.ReleaseEntity( + work_id=None, + container_id=container_id, + release_type=None, + release_stage=None, + title=title, # attributes.titles, various titleType + subtitle=subtitle, + original_title=title, # AlternativeTitle? + release_year=release_year, # publicationYear + release_date=release_date, # date issues/available? + publisher=publisher, # attributes.publisher + ext_ids=fatcat_openapi_client.ReleaseExtIds( + doi=attributes.get('doi'), # attributes.doi, + # Can we add handle.net link? + ), + contribs=contribs, + volume=None, + issue=None, + pages=None, + language=None, + abstracts=None, + refs=None, + extra=extra, + license_slug=license_slug, + ) + return re + + def try_update(self, re, debug=True): + if debug is True: + # print(type(re)) + print(json.dumps(re.to_dict(), default=extended_encoder)) + return + return False + + def insert_batch(self, batch): + # Debugging. + for item in batch: + print(item) + return + + # Orig. 
+        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+def extended_encoder(value):
+    """
+    Can be used with json.dumps(value, default=extended_encoder) to serialize
+    values not serializable by default.
+    https://docs.python.org/3/library/json.html#basic-usage
+    """
+    if isinstance(value, (datetime.datetime, datetime.date)):
+        return value.isoformat()
+    if isinstance(value, set):
+        return list(value)
+
+def lookup_license_slug(raw):
+    """
+    TODO(martin): reuse from crossref, maybe.
+    """
+    if not raw:
+        return None
+    raw = raw.strip().replace('http://', '//').replace('https://', '//')
+    if 'creativecommons.org' in raw.lower():
+        raw = raw.lower()
+        raw = raw.replace('/legalcode', '').replace('/uk', '')
+    # Map keys carry no trailing slash.
+    raw = raw.rstrip('/')
+    return LICENSE_SLUG_MAP.get(raw)
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
new file mode 100644
index 00000000..0bbaba2e
--- /dev/null
+++ b/python/tests/import_datacite.py
@@ -0,0 +1,25 @@
+"""
+Test datacite importer.
+
+Datacite is an aggregator, hence inputs are quite varied.
+
+Here is a small sample of ID types taken from a sample:
+
+    497344 "DOI"
+     65013 "URL"
+     22210 "CCDC"
+     17853 "GBIF"
+     17635 "Other"
+     11474 "uri"
+      9170 "Publisher ID"
+      7775 "URN"
+      6196 "DUCHAS"
+      5624 "Handle"
+      5056 "publisherId"
+
+A nice, not-yet-existing tool (maybe named indigo) would do the following:
+
+    $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md
+
+TODO(martin): Write tests.
+"""
```
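To make the two module-level helpers concrete, a small illustrative check — assuming `python/` is on the import path; the inputs are made up:

```python
# Illustrative only; exercises helpers defined in the new module above.
import datetime
import json

from fatcat_tools.importers.datacite import extended_encoder, lookup_license_slug

# lookup_license_slug drops the URL scheme and, for Creative Commons URLs,
# strips "/legalcode" and "/uk" suffixes before the map lookup.
assert lookup_license_slug('https://creativecommons.org/licenses/by/4.0/legalcode') == 'CC-BY'
assert lookup_license_slug('http://www.opensource.org/licenses/MIT') == 'MIT'
assert lookup_license_slug('info:eu-repo/semantics/openAccess') is None  # not mapped (yet)
assert lookup_license_slug(None) is None

# extended_encoder lets json.dumps serialize dates and sets, as used by
# try_update() for debug output.
print(json.dumps({'release_date': datetime.date(2019, 12, 9), 'tags': {'wip'}},
                 default=extended_encoder))
# -> {"release_date": "2019-12-09", "tags": ["wip"]}
```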