diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-11-17 19:40:54 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-11-19 14:55:15 -0800 |
commit | 92db2c8bb2464db8455b61b245a007cb57f2c92f (patch) | |
tree | 53ea173f7588810c0461ae72e9703386262aea12 | |
parent | c9bfb0be4c7e38b6668f49588f2ffecee7b17912 (diff) | |
download | fatcat-92db2c8bb2464db8455b61b245a007cb57f2c92f.tar.gz fatcat-92db2c8bb2464db8455b61b245a007cb57f2c92f.zip |
implement remainder of DOAJ article importer
-rwxr-xr-x | python/fatcat_import.py | 37 |
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 182 |
-rw-r--r-- | python/tests/import_doaj.py | 17 |
3 files changed, 168 insertions, 68 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 19cf43ec..ff6c94dc 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -256,6 +256,24 @@ def run_datacite(args): else: JsonLinePusher(dci, args.json_file).run() +def run_doaj_article(args): + dai = DoajArticleImporter(args.api, + args.issn_map_file, + edit_batch_size=args.batch_size, + do_updates=args.do_updates, + ) + if args.kafka_mode: + KafkaJsonPusher( + dai, + args.kafka_hosts, + args.kafka_env, + "api-doaj", + "fatcat-{}-import-doaj".format(args.kafka_env), + consume_batch_size=args.batch_size, + ).run() + else: + JsonLinePusher(dai, args.json_file).run() + def run_file_meta(args): # do_updates defaults to true for this importer fmi = FileMetaImporter(args.api, @@ -606,6 +624,25 @@ def main(): auth_var="FATCAT_AUTH_WORKER_DATACITE", ) + sub_doaj_article = subparsers.add_parser('doaj-article', + help="import doaj.org article metadata") + sub_doaj_article.add_argument('json_file', + help="File with JSON lines from DOAJ API (or bulk dump) to import from", + default=sys.stdin, type=argparse.FileType('r')) + sub_doaj_article.add_argument('--issn-map-file', + help="ISSN to ISSN-L mapping file", + default=None, type=argparse.FileType('r')) + sub_doaj_article.add_argument('--kafka-mode', + action='store_true', + help="consume from kafka topic (not stdin)") + sub_doaj_article.add_argument('--do-updates', + action='store_true', + help="update any pre-existing release entities") + sub_doaj_article.set_defaults( + func=run_doaj_article, + auth_var="FATCAT_AUTH_WORKER_DOAJ", + ) + sub_file_meta = subparsers.add_parser('file-meta', help="simple update-only importer for file metadata") sub_file_meta.set_defaults( diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 74ac9a0e..c0e75283 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -4,17 +4,15 @@ Importer for DOAJ 
article-level metadata, schema v1. DOAJ API schema and docs: https://doaj.org/api/v1/docs """ -import collections +import warnings import datetime -import sys -from typing import List, Dict, Optional - -import langdetect +from typing import List, Optional import fatcat_openapi_client -from fatcat_tools.normal import clean_doi -from fatcat_tools.transforms import entity_to_dict -from fatcat_tools.importers.common import EntityImporter, clean +from fatcat_tools.normal import (clean_doi, clean_str, parse_month, + clean_orcid, detect_text_lang, parse_lang_name, parse_country_name, + clean_pmid, clean_pmcid) +from fatcat_tools.importers.common import EntityImporter # Cutoff length for abstracts. MAX_ABSTRACT_LENGTH = 2048 @@ -48,7 +46,6 @@ class DoajArticleImporter(EntityImporter): def want(self, obj): return True - def parse_record(self, obj): """ bibjson { @@ -74,14 +71,6 @@ class DoajArticleImporter(EntityImporter): title (string, optional), volume (string, optional) } - - TODO: - - release_date - - container_id - - issue (number?) 
- - license is article license; import as slug - - "open_access" flag in doaj_meta - - container lookup from issns ("issns" key) """ if not obj or not isinstance(obj, dict) or not 'bibjson' in obj: @@ -90,42 +79,51 @@ class DoajArticleImporter(EntityImporter): bibjson = obj['bibjson'] - title = clean(bibjson.get('title')) + title = clean_str(bibjson.get('title'), force_xml=True) if not title: self.counts['skip-title'] += 1 return False + container_name = clean_str(bibjson['journal']['title']) container_id = None - container_name = None - - volume = clean(bibjson['journal'].get('volume')) - number = clean(bibjson['journal'].get('number')) - publisher = clean(bibjson['journal'].get('publisher')) + # NOTE: 'issns' not documented in API schema + for issn in bibjson['journal']['issns']: + issnl = self.issn2issnl(issn) + if issnl: + container_id = self.lookup_issnl(self.issn2issnl(issn)) + if container_id: + # don't store container_name when we have an exact match + container_name = None + break + + volume = clean_str(bibjson['journal'].get('volume')) + # NOTE: this schema seems to use "number" as "issue number" + issue = clean_str(bibjson['journal'].get('number')) + publisher = clean_str(bibjson['journal'].get('publisher')) try: release_year = int(bibjson.get('year')) except (TypeError, ValueError): release_year = None - # XXX: parse_month - release_month = clean(bibjson.get('year')) + release_month = parse_month(clean_str(bibjson.get('month'))) # block bogus far-future years/dates if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): release_month = None release_year = None - # country - country = None - # XXX: country = parse_country(bibjson['journal'].get('country')) - - # language + license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) + country = parse_country_name(bibjson['journal'].get('country')) language = None - # XXX: language = parse_language(bibjson['journal'].get('language')) + for raw in 
bibjson['journal'].get('language') or []: + language = parse_lang_name(raw) + if language: + break # pages - # TODO: error in API docs? seems like start_page not under 'journal' object - start_page = clean(bibjson['journal'].get('start_page')) or clean(bibjson.get('start_page')) - end_page = clean(bibjson['journal'].get('end_page')) or clean(bibjson.get('end_page')) + # NOTE: error in API docs? seems like start_page not under 'journal' object + start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) + end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) pages: Optional[str] = None if start_page and end_page: pages = f"{start_page}-{end_page}" @@ -136,13 +134,13 @@ class DoajArticleImporter(EntityImporter): ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) abstracts = self.doaj_abstracts(bibjson) contribs = self.doaj_contribs(bibjson.get('author') or []) - + # DOAJ-specific extra doaj_extra = dict() if bibjson.get('subject'): doaj_extra['subject'] = bibjson.get('subject') if bibjson.get('keywords'): - doaj_extra['keywords'] = [k for k in [clean(s) for s in bibjson.get('keywords')] if k] + doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] # generic extra extra = dict() @@ -171,13 +169,12 @@ class DoajArticleImporter(EntityImporter): ext_ids=ext_ids, contribs=contribs, volume=volume, - number=number, # XXX - #issue, + issue=issue, pages=pages, language=language, abstracts=abstracts, extra=extra, - #license_slug=license_slug, + license_slug=license_slug, ) re = self.biblio_hacks(re) return re @@ -192,7 +189,7 @@ class DoajArticleImporter(EntityImporter): def try_update(self, re): - # lookup existing DOI (don't need to try other ext idents for crossref) + # lookup existing release by DOAJ article id existing = None try: existing = self.api.lookup_release(doaj=re.ext_ids.doaj) @@ -202,13 +199,62 @@ class 
DoajArticleImporter(EntityImporter): # doesn't exist, need to update return True - # eventually we'll want to support "updates", but for now just skip if - # entity already exists - if existing: + # then try other ext_id lookups + if not existing: + for extid_type in ('doi', 'pmid', 'pmcid'): + extid_val = re.ext_ids.__dict__[extid_type] + if not extid_val: + continue + try: + existing = self.api.lookup_release(**{extid_type: extid_val}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + if existing: + if existing.ext_ids.doaj: + warn_str = f"unexpected DOAJ ext_id match after lookup failed doaj={re.ext_ids.doaj} ident={existing.ident}" + warnings.warn(warn_str) + self.counts["skip-doaj-id-mismatch"] += 1 + return None + break + + # TODO: in the future could do fuzzy match here, eg using elasticsearch + + # create entity + if not existing: + return True + + # other logic could go here about skipping updates + if not self.do_updates or existing.ext_ids.doaj: self.counts['exists'] += 1 return False - return True + # fields to copy over for update + existing.ext_ids.doaj = existing.ext_ids.doaj or re.ext_ids.doaj + existing.release_type = existing.release_type or re.release_type + existing.release_stage = existing.release_stage or re.release_stage + existing.container_id = existing.container_id or re.container_id + existing.abstracts = existing.abstracts or re.abstracts + existing.extra['doaj'] = re.extra['doaj'] + existing.volume = existing.volume or re.volume + existing.issue = existing.issue or re.issue + existing.pages = existing.pages or re.pages + existing.language = existing.language or re.language + + try: + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + except fatcat_openapi_client.rest.ApiException as err: + # there is a code path where we try to update the same release + # twice in a row; if that happens, just skip + # NOTE: API behavior might change in 
the future? + if "release_edit_editgroup_id_ident_id_key" in err.body: + self.counts['skip-update-conflict'] += 1 + return False + else: + raise err + + return False def insert_batch(self, batch): self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( @@ -218,19 +264,13 @@ class DoajArticleImporter(EntityImporter): entity_list=batch)) def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: - text = clean(bibjson['abstract']) + text = clean_str(bibjson.get('abstract')) if not text or len(text) < 10: return [] if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. This is fuzzy and may be removed, if too unreliable. - lang = None - try: - lang = langdetect.detect(text) - except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: - #print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) - pass + lang = detect_text_lang(text) abstract = fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", @@ -249,15 +289,22 @@ class DoajArticleImporter(EntityImporter): } """ contribs = [] - # TODO: index? 
+ index = 0 for author in authors: if not author.get('name'): continue + creator_id = None + orcid = clean_orcid(author.get('orcid_id')) + if orcid: + creator_id = self.lookup_orcid(orcid) contribs.append(fatcat_openapi_client.ReleaseContrib( raw_name=author.get('name'), - # XXX: orcid_id=author.get('orcid_id') or None, - # XXX: affiliation=author.get('affiliation') or None, + role='author', + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get('affiliation')), )) + index += 1 return contribs def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: @@ -277,9 +324,9 @@ class DoajArticleImporter(EntityImporter): if id_obj['type'].lower() == 'doi': doi = clean_doi(id_obj['id']) elif id_obj['type'].lower() == 'pmid': - pmid = id_obj['id'] + pmid = clean_pmid(id_obj['id']) elif id_obj['type'].lower() == 'pmcid': - pmcid = id_obj['id'] + pmcid = clean_pmcid(id_obj['id']) return fatcat_openapi_client.ReleaseExtIds( doaj=doaj_article_id, @@ -287,3 +334,24 @@ class DoajArticleImporter(EntityImporter): pmid=pmid, pmcid=pmcid, ) + + def doaj_license_slug(self, license_list: List[dict]) -> Optional[str]: + """ + bibjson.journal.license { + open_access (boolean, optional), + title (string, optional), + type (string, optional), + url (string, optional), + version (string, optional) + } + """ + if not license_list: + return None + for license in license_list: + if not license.get('open_access'): + continue + slug = license.get('type') + if slug.startswith('CC '): + slug = slug.replace('CC ', 'cc-').lower() + return slug + return None diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py index a75b574e..bceb1343 100644 --- a/python/tests/import_doaj.py +++ b/python/tests/import_doaj.py @@ -60,7 +60,7 @@ def test_doaj_dict_parse(doaj_importer): assert r.publisher == "Elsevier" assert r.release_type == "article-journal" assert r.release_stage == "published" - # XXX: assert r.license_slug 
== "cc-by-nc-nd" + assert r.license_slug == "cc-by-nc-nd" assert r.original_title == None assert r.ext_ids.doi == "10.1016/j.matdes.2016.06.110" assert r.ext_ids.doaj == "e58f08a11ecb495ead55a44ad4f89808" @@ -71,9 +71,9 @@ def test_doaj_dict_parse(doaj_importer): assert r.number == None assert r.pages == "608-617" assert r.version == None - # XXX: assert r.language == "en" + assert r.language == "en" # matched by ISSN, so wouldn't be defined normally - # XXX: assert r.extra['container_name'] == "Materials & Design" + assert r.extra['container_name'] == "Materials & Design" assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 1033 assert len(r.contribs) == 5 @@ -82,11 +82,6 @@ def test_doaj_dict_parse(doaj_importer): assert r.contribs[0].surname == None assert not r.refs - print(r.extra) - # XXX: assert r.extra['release_month'] == 10 - # XXX: assert r.extra['country'] == 'gb' - - #assert r.extra["doaj"]["subjects"] == [ - # {"subject": "Plant Genetic Resource for Food and Agriculture"} - #] - + #print(r.extra) + assert r.extra['release_month'] == 10 + assert r.extra['country'] == 'gb' |